11from __future__ import annotations
22
33import asyncio
4+ import warnings
45from dataclasses import dataclass , replace
56from enum import Enum
67from functools import cached_property
7- from typing import TYPE_CHECKING
8+ from typing import TYPE_CHECKING , Final , Literal , NotRequired , TypedDict
89
910import numcodecs
1011from numcodecs .blosc import Blosc
1112from packaging .version import Version
1213
1314from zarr .abc .codec import BytesBytesCodec
1415from zarr .core .buffer .cpu import as_numpy_array_wrapper
15- from zarr .core .common import JSON , parse_enum , parse_named_configuration
16+ from zarr .core .common import JSON , NamedRequiredConfig , parse_enum , parse_named_configuration
1617from zarr .core .dtype .common import HasItemSize
18+ from zarr .errors import ZarrDeprecationWarning
1719
1820if TYPE_CHECKING :
1921 from typing import Self
2022
2123 from zarr .core .array_spec import ArraySpec
2224 from zarr .core .buffer import Buffer
2325
26+ Shuffle = Literal ["noshuffle" , "shuffle" , "bitshuffle" ]
27+ """The shuffle values permitted for the blosc codec"""
28+
29+ SHUFFLE : Final = ("noshuffle" , "shuffle" , "bitshuffle" )
30+
31+ CName = Literal ["lz4" , "lz4hc" , "blosclz" , "snappy" , "zlib" , "zstd" ]
32+ """The codec identifiers used in the blosc codec """
33+
34+
35+ class BloscConfigV2 (TypedDict ):
36+ """Configuration for the V2 Blosc codec"""
37+
38+ cname : CName
39+ clevel : int
40+ shuffle : int
41+ blocksize : int
42+ typesize : NotRequired [int ]
43+
44+
45+ class BloscConfigV3 (TypedDict ):
46+ """Configuration for the V3 Blosc codec"""
47+
48+ cname : CName
49+ clevel : int
50+ shuffle : Shuffle
51+ blocksize : int
52+ typesize : int
53+
54+
55+ class BloscJSON_V3 (NamedRequiredConfig [Literal ["blosc" ], BloscConfigV3 ]):
56+ """
57+ The JSON form of the Blosc codec in Zarr V3.
58+ """
59+
2460
2561class BloscShuffle (Enum ):
2662 """
@@ -86,29 +122,186 @@ def parse_blocksize(data: JSON) -> int:
86122
87123@dataclass (frozen = True )
88124class BloscCodec (BytesBytesCodec ):
89- """blosc codec"""
125+ """
126+ Blosc compression codec for zarr.
127+
128+ Blosc is a high-performance compressor optimized for binary data. It uses a
129+ combination of blocking, shuffling, and fast compression algorithms to achieve
130+ excellent compression ratios and speed.
131+
132+ Attributes
133+ ----------
134+ tunable_attrs : set of {'typesize', 'shuffle'}
135+ Attributes that will be automatically tuned when `evolve_from_array_spec()`
136+ is called. By default, contains {'typesize', 'shuffle'}. When either
137+ `typesize` or `shuffle` is explicitly set to None during initialization,
138+ the corresponding attribute is added to this set (if not already present),
139+ allowing it to be overridden based on the array's dtype.
140+ is_fixed_size : bool
141+ Always False for Blosc codec, as compression produces variable-sized output.
142+ typesize : int
143+ The data type size in bytes used for shuffle filtering.
144+ cname : BloscCname
145+ The compression algorithm being used (lz4, lz4hc, blosclz, snappy, zlib, or zstd).
146+ clevel : int
147+ The compression level (0-9).
148+ shuffle : BloscShuffle
149+ The shuffle filter mode (noshuffle, shuffle, or bitshuffle).
150+ blocksize : int
151+ The size of compressed blocks in bytes (0 for automatic).
152+
153+ Parameters
154+ ----------
155+ typesize : int, optional
156+ The data type size in bytes. This affects how the shuffle filter processes
157+ the data. If None (deprecated), defaults to 1 and the attribute is marked
158+ as tunable. Default: 1.
159+ cname : BloscCname or {'lz4', 'lz4hc', 'blosclz', 'snappy', 'zlib', 'zstd'}, optional
160+ The compression algorithm to use. Default: 'zstd'.
161+ clevel : int, optional
162+ The compression level, from 0 (no compression) to 9 (maximum compression).
163+ Higher values provide better compression at the cost of speed. Default: 5.
164+ shuffle : BloscShuffle or {'noshuffle', 'shuffle', 'bitshuffle'}, optional
165+ The shuffle filter to apply before compression:
166+
167+ - 'noshuffle': No shuffling
168+ - 'shuffle': Byte shuffling (better for typesize > 1)
169+ - 'bitshuffle': Bit shuffling (better for typesize == 1)
170+
171+ If None (deprecated), defaults to 'bitshuffle' and the attribute is marked
172+ as tunable. Default: 'bitshuffle'.
173+ blocksize : int, optional
174+ The requested size of compressed blocks in bytes. A value of 0 means
175+ automatic block size selection. Default: 0.
176+ tunable_attrs : set of {'typesize', 'shuffle'}, optional
177+ Names of attributes that can be automatically adjusted by
178+ `evolve_from_array_spec()`. This allows the codec to adapt its parameters
179+ based on the array's data type when the array is created. If None, defaults
180+ to {'typesize', 'shuffle'}.
181+
182+ Notes
183+ -----
184+ **Tunable Attributes Logic**:
185+
186+ The `tunable_attrs` mechanism allows codec parameters to be automatically
187+ adjusted based on the array's data type:
188+
189+ 1. **Initialization**: During `__init__`, if `tunable_attrs` is None, it
190+ defaults to {'typesize', 'shuffle'}. This means both attributes can be
191+ tuned by default.
192+
193+ 2. **Deprecated None Values**: If `typesize` or `shuffle` is explicitly set
194+ to None:
195+
196+ - A deprecation warning is issued
197+ - The parameter is set to a default value (1 for typesize, 'bitshuffle'
198+ for shuffle)
199+ - The attribute name is added to `tunable_attrs`
200+
201+ 3. **Evolution**: When `evolve_from_array_spec()` is called (typically during
202+ array creation), it creates a new codec instance with updated parameters:
203+
204+ - If 'typesize' is in `tunable_attrs`, it's set to the array dtype's
205+ item size
206+ - If 'shuffle' is in `tunable_attrs`, it's set to 'bitshuffle' if
207+ item_size == 1, otherwise 'shuffle'
208+
209+ 4. **Explicit Values**: Setting `typesize=4` or `shuffle='noshuffle'` does
210+ not by itself remove those names from `tunable_attrs`; also pass
211+ `tunable_attrs=set()` to stop `evolve_from_array_spec()` from changing them.
212+
213+ **Thread Safety**: This codec sets `numcodecs.blosc.use_threads = False` at
214+ module import time to avoid threading issues in asyncio contexts.
215+
216+ Examples
217+ --------
218+ Create a Blosc codec with default settings:
219+
220+ >>> codec = BloscCodec()
221+ >>> codec.typesize
222+ 1
223+ >>> codec.shuffle
224+ <BloscShuffle.bitshuffle: 'bitshuffle'>
225+
226+ Create a codec with specific compression settings:
227+
228+ >>> codec = BloscCodec(cname='zstd', clevel=9, shuffle='shuffle')
229+ >>> codec.cname
230+ <BloscCname.zstd: 'zstd'>
231+
232+ Use deprecated None values (will be tuned automatically):
233+
234+ >>> codec = BloscCodec(typesize=None, shuffle=None) # doctest: +SKIP
235+ ZarrDeprecationWarning: The typesize parameter was set to None...
236+ >>> 'typesize' in codec.tunable_attrs
237+ True
238+ >>> 'shuffle' in codec.tunable_attrs
239+ True
240+
241+ Prevent automatic tuning:
242+
243+ >>> codec = BloscCodec(typesize=4, shuffle='noshuffle', tunable_attrs=set())
244+ >>> codec.tunable_attrs
245+ set()
246+
247+ See Also
248+ --------
249+ BloscShuffle : Enum for shuffle filter options
250+ BloscCname : Enum for compression algorithm options
251+ """
90252
253+ tunable_attrs : set [Literal ["typesize" , "shuffle" ]]
91254 is_fixed_size = False
92255
93- typesize : int | None
94- cname : BloscCname = BloscCname . zstd
95- clevel : int = 5
96- shuffle : BloscShuffle | None = BloscShuffle . noshuffle
97- blocksize : int = 0
256+ typesize : int
257+ cname : BloscCname
258+ clevel : int
259+ shuffle : BloscShuffle
260+ blocksize : int
98261
99262 def __init__ (
100263 self ,
101264 * ,
102- typesize : int | None = None ,
103- cname : BloscCname | str = BloscCname .zstd ,
265+ typesize : int | None = 1 ,
266+ cname : BloscCname | CName = BloscCname .zstd ,
104267 clevel : int = 5 ,
105- shuffle : BloscShuffle | str | None = None ,
268+ shuffle : BloscShuffle | Shuffle | None = "bitshuffle" ,
106269 blocksize : int = 0 ,
270+ tunable_attrs : set [Literal ["typesize" , "shuffle" ]] | None = None ,
107271 ) -> None :
108- typesize_parsed = parse_typesize (typesize ) if typesize is not None else None
272+ # set default value of tunable_attrs
273+ if tunable_attrs is None :
274+ object .__setattr__ (self , "tunable_attrs" , {"typesize" , "shuffle" })
275+ else :
276+ object .__setattr__ (self , "tunable_attrs" , tunable_attrs )
277+
278+ # If typesize was set to None: warn, replace it with a valid typesize
279+ # and flag the typesize attribute as safe to replace later
280+ if typesize is None :
281+ msg = (
282+ "The typesize parameter was set to None. This is deprecated. "
283+ "Provide a positive int for the typesize parameter instead. "
284+ )
285+ warnings .warn (msg , ZarrDeprecationWarning , stacklevel = 2 )
286+ typesize = 1
287+ self .tunable_attrs .update ({"typesize" })
288+
289+ # If shuffle was set to None: warn, replace it with a valid shuffle value
290+ # and flag the shuffle attribute as safe to replace later
291+ if shuffle is None :
292+ msg = (
293+ "The shuffle parameter was set to None. This is deprecated. "
294+ "Provide a valid shuffle literal string -- "
295+ f"one of { SHUFFLE !r} -- instead."
296+ )
297+ warnings .warn (msg , ZarrDeprecationWarning , stacklevel = 2 )
298+ shuffle = BloscShuffle .bitshuffle
299+ self .tunable_attrs .update ({"shuffle" })
300+
301+ typesize_parsed = parse_typesize (typesize )
109302 cname_parsed = parse_enum (cname , BloscCname )
110303 clevel_parsed = parse_clevel (clevel )
111- shuffle_parsed = parse_enum (shuffle , BloscShuffle ) if shuffle is not None else None
304+ shuffle_parsed = parse_enum (shuffle , BloscShuffle )
112305 blocksize_parsed = parse_blocksize (blocksize )
113306
114307 object .__setattr__ (self , "typesize" , typesize_parsed )
@@ -123,11 +316,7 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
123316 return cls (** configuration_parsed ) # type: ignore[arg-type]
124317
125318 def to_dict (self ) -> dict [str , JSON ]:
126- if self .typesize is None :
127- raise ValueError ("`typesize` needs to be set for serialization." )
128- if self .shuffle is None :
129- raise ValueError ("`shuffle` needs to be set for serialization." )
130- return {
319+ result : BloscJSON_V3 = {
131320 "name" : "blosc" ,
132321 "configuration" : {
133322 "typesize" : self .typesize ,
@@ -137,15 +326,22 @@ def to_dict(self) -> dict[str, JSON]:
137326 "blocksize" : self .blocksize ,
138327 },
139328 }
329+ return result # type: ignore[return-value]
140330
141331 def evolve_from_array_spec (self , array_spec : ArraySpec ) -> Self :
332+ """
333+ Create a new codec with typesize and shuffle parameters adjusted
334+ according to the size of each element in the data type
335+ associated with array_spec. Parameters are only updated if their names are
336+ present in self.tunable_attrs.
337+ """
142338 item_size = 1
143339 if isinstance (array_spec .dtype , HasItemSize ):
144340 item_size = array_spec .dtype .item_size
145341 new_codec = self
146- if new_codec . typesize is None :
342+ if " typesize" in self . tunable_attrs :
147343 new_codec = replace (new_codec , typesize = item_size )
148- if new_codec . shuffle is None :
344+ if " shuffle" in self . tunable_attrs :
149345 new_codec = replace (
150346 new_codec ,
151347 shuffle = (BloscShuffle .bitshuffle if item_size == 1 else BloscShuffle .shuffle ),
@@ -155,15 +351,13 @@ def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
155351
156352 @cached_property
157353 def _blosc_codec (self ) -> Blosc :
158- if self .shuffle is None :
159- raise ValueError ("`shuffle` needs to be set for decoding and encoding." )
160354 map_shuffle_str_to_int = {
161355 BloscShuffle .noshuffle : 0 ,
162356 BloscShuffle .shuffle : 1 ,
163357 BloscShuffle .bitshuffle : 2 ,
164358 }
165- config_dict = {
166- "cname" : self .cname .name ,
359+ config_dict : BloscConfigV2 = {
360+ "cname" : self .cname .name , # type: ignore[typeddict-item]
167361 "clevel" : self .clevel ,
168362 "shuffle" : map_shuffle_str_to_int [self .shuffle ],
169363 "blocksize" : self .blocksize ,
0 commit comments