Skip to content

Commit 1a2eeed

Browse files
abergouAttila Bergoujoshmoore
authored
Fix structured arrays that contain objects #806 (#813)
* Fix structured arrays that contain objects #806 * Ensures that the fill value of structured arrays that contain objects is encoded using object_codec. * Add test and fix-up to ensure compatibility * Update docs/release.rst * Fix up unit tests: don't specify protocol (makes unit tests pass in Python 3.7); N5 doesn't support object codecs * Fix up linting error: explicitly handle an error condition that can only happen if encode_fill_value or decode_fill_value are directly called. * Add encode/decode tests for codecov * Explicitly import Pickle from numcodecs for mypy * Migrate test from #702, with thanks to @ombschervister * Install types-setuptools for CI Co-authored-by: Attila Bergou <[email protected]> Co-authored-by: Josh Moore <[email protected]> Co-authored-by: jmoore <[email protected]>
1 parent e095fe5 commit 1a2eeed

File tree

8 files changed

+174
-15
lines changed

8 files changed

+174
-15
lines changed

docs/release.rst

+11-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,17 @@ Release notes
66
Unreleased
77
----------
88

9+
.. _release_2.9.4:
10+
11+
2.9.4
12+
-----
13+
14+
Bug fixes
15+
~~~~~~~~~
16+
17+
* Fix structured arrays that contain objects
18+
By :user:`Attila Bergou <abergou>`; :issue:`806`.
19+
920
.. _release_2.9.3:
1021

1122
2.9.3
@@ -31,7 +42,6 @@ Maintenance
3142
* Correct conda-forge deployment of Zarr by fixing some Zarr tests.
3243
By :user:`Ben Williams <benjaminhwilliams>`; :issue:`821`.
3344

34-
3545
.. _release_2.9.1:
3646

3747
2.9.1
@@ -92,7 +102,6 @@ Maintenance
92102
* TST: add missing assert in test_hexdigest.
93103
By :user:`Greggory Lee <grlee77>`; :issue:`801`.
94104

95-
96105
.. _release_2.8.3:
97106

98107
2.8.3

requirements_dev_optional.txt

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ ipytree==0.2.1
99
azure-storage-blob==12.8.1 # pyup: ignore
1010
redis==3.5.3
1111
types-redis
12+
types-setuptools
1213
pymongo==3.12.0
1314
# optional test requirements
1415
tox==3.24.3

zarr/codecs.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# flake8: noqa
22
from numcodecs import *
3-
from numcodecs import get_codec, Blosc, Zlib, Delta, AsType, BZ2
3+
from numcodecs import get_codec, Blosc, Pickle, Zlib, Delta, AsType, BZ2
44
from numcodecs.registry import codec_registry

zarr/meta.py

+31-7
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,14 @@ def decode_array_metadata(s: Union[MappingType, str]) -> MappingType[str, Any]:
4040
# extract array metadata fields
4141
try:
4242
dtype = decode_dtype(meta['dtype'])
43-
fill_value = decode_fill_value(meta['fill_value'], dtype)
43+
44+
if dtype.hasobject:
45+
import numcodecs
46+
object_codec = numcodecs.get_codec(meta['filters'][0])
47+
else:
48+
object_codec = None
49+
50+
fill_value = decode_fill_value(meta['fill_value'], dtype, object_codec)
4451
meta = dict(
4552
zarr_format=meta['zarr_format'],
4653
shape=tuple(meta['shape']),
@@ -66,14 +73,18 @@ def encode_array_metadata(meta: MappingType[str, Any]) -> bytes:
6673
dtype, sdshape = dtype.subdtype
6774

6875
dimension_separator = meta.get('dimension_separator')
69-
76+
if dtype.hasobject:
77+
import numcodecs
78+
object_codec = numcodecs.get_codec(meta['filters'][0])
79+
else:
80+
object_codec = None
7081
meta = dict(
7182
zarr_format=ZARR_FORMAT,
7283
shape=meta['shape'] + sdshape,
7384
chunks=meta['chunks'],
7485
dtype=encode_dtype(dtype),
7586
compressor=meta['compressor'],
76-
fill_value=encode_fill_value(meta['fill_value'], dtype),
87+
fill_value=encode_fill_value(meta['fill_value'], dtype, object_codec),
7788
order=meta['order'],
7889
filters=meta['filters'],
7990
)
@@ -132,10 +143,17 @@ def encode_group_metadata(meta=None) -> bytes:
132143
}
133144

134145

135-
def decode_fill_value(v, dtype):
146+
def decode_fill_value(v, dtype, object_codec=None):
136147
# early out
137148
if v is None:
138149
return v
150+
if dtype.kind == 'V' and dtype.hasobject:
151+
if object_codec is None:
152+
raise ValueError('missing object_codec for object array')
153+
v = base64.standard_b64decode(v)
154+
v = object_codec.decode(v)
155+
v = np.array(v, dtype=dtype)[()]
156+
return v
139157
if dtype.kind == 'f':
140158
if v == 'NaN':
141159
return np.nan
@@ -171,10 +189,16 @@ def decode_fill_value(v, dtype):
171189
return np.array(v, dtype=dtype)[()]
172190

173191

174-
def encode_fill_value(v: Any, dtype: np.dtype) -> Any:
192+
def encode_fill_value(v: Any, dtype: np.dtype, object_codec: Any = None) -> Any:
175193
# early out
176194
if v is None:
177195
return v
196+
if dtype.kind == 'V' and dtype.hasobject:
197+
if object_codec is None:
198+
raise ValueError('missing object_codec for object array')
199+
v = object_codec.encode(v)
200+
v = str(base64.standard_b64encode(v), 'ascii')
201+
return v
178202
if dtype.kind == 'f':
179203
if np.isnan(v):
180204
return 'NaN'
@@ -190,8 +214,8 @@ def encode_fill_value(v: Any, dtype: np.dtype) -> Any:
190214
return bool(v)
191215
elif dtype.kind in 'c':
192216
c = cast(np.complex128, np.dtype(complex).type())
193-
v = (encode_fill_value(v.real, c.real.dtype),
194-
encode_fill_value(v.imag, c.imag.dtype))
217+
v = (encode_fill_value(v.real, c.real.dtype, object_codec),
218+
encode_fill_value(v.imag, c.imag.dtype, object_codec))
195219
return v
196220
elif dtype.kind in 'SV':
197221
v = str(base64.standard_b64encode(v), 'ascii')

zarr/storage.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -423,7 +423,7 @@ def _init_array_metadata(
423423
filters_config = []
424424

425425
# deal with object encoding
426-
if dtype == object:
426+
if dtype.hasobject:
427427
if object_codec is None:
428428
if not filters:
429429
# there are no filters so we can be sure there is no object codec

zarr/tests/test_core.py

+59
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from numcodecs.compat import ensure_bytes, ensure_ndarray
1616
from numcodecs.tests.common import greetings
1717
from numpy.testing import assert_array_almost_equal, assert_array_equal
18+
from pkg_resources import parse_version
1819

1920
from zarr.core import Array
2021
from zarr.meta import json_loads
@@ -1362,6 +1363,44 @@ def test_object_codec_warnings(self):
13621363
if hasattr(z.store, 'close'):
13631364
z.store.close()
13641365

1366+
@unittest.skipIf(parse_version(np.__version__) < parse_version('1.14.0'),
1367+
"unsupported numpy version")
1368+
def test_structured_array_contain_object(self):
1369+
1370+
if "PartialRead" in self.__class__.__name__:
1371+
pytest.skip("partial reads of object arrays not supported")
1372+
1373+
# ----------- creation --------------
1374+
1375+
structured_dtype = [('c_obj', object), ('c_int', int)]
1376+
a = np.array([(b'aaa', 1),
1377+
(b'bbb', 2)], dtype=structured_dtype)
1378+
1379+
# a zarr array with a structured dtype that contains objects requires an object codec
1380+
with pytest.raises(ValueError):
1381+
self.create_array(shape=a.shape, dtype=structured_dtype)
1382+
1383+
# create a zarr array and fill it from the NumPy array
1384+
za = self.create_array(shape=a.shape, dtype=structured_dtype, object_codec=Pickle())
1385+
za[:] = a
1386+
1387+
# must be equal
1388+
assert_array_equal(a, za[:])
1389+
1390+
# ---------- indexing ---------------
1391+
1392+
assert za[0] == a[0]
1393+
1394+
za[0] = (b'ccc', 3)
1395+
za[1:2] = np.array([(b'ddd', 4)], dtype=structured_dtype) # ToDo: not work with list
1396+
assert_array_equal(za[:], np.array([(b'ccc', 3), (b'ddd', 4)], dtype=structured_dtype))
1397+
1398+
za['c_obj'] = [b'eee', b'fff']
1399+
za['c_obj', 0] = b'ggg'
1400+
assert_array_equal(za[:], np.array([(b'ggg', 3), (b'fff', 4)], dtype=structured_dtype))
1401+
assert za['c_obj', 0] == b'ggg'
1402+
assert za[1, 'c_int'] == 4
1403+
13651404
def test_iteration_exceptions(self):
13661405
# zero d array
13671406
a = np.array(1, dtype=int)
@@ -1490,6 +1529,14 @@ def test_attributes(self):
14901529
if hasattr(a.store, 'close'):
14911530
a.store.close()
14921531

1532+
def test_structured_with_object(self):
1533+
a = self.create_array(fill_value=(0.0, None),
1534+
shape=10,
1535+
chunks=10,
1536+
dtype=[('x', float), ('y', object)],
1537+
object_codec=Pickle())
1538+
assert tuple(a[0]) == (0.0, None)
1539+
14931540

14941541
class TestArrayWithPath(TestArray):
14951542

@@ -1893,6 +1940,14 @@ def test_object_arrays_danger(self):
18931940
# Cannot hack out the object codec, as N5 doesn't allow object codecs
18941941
pass
18951942

1943+
def test_structured_with_object(self):
1944+
# Cannot hack out the object codec, as N5 doesn't allow object codecs
1945+
pass
1946+
1947+
def test_structured_array_contain_object(self):
1948+
# Cannot hack out the object codec, as N5 doesn't allow object codecs
1949+
pass
1950+
18961951
def test_attrs_n5_keywords(self):
18971952
z = self.create_array(shape=(1050,), chunks=100, dtype='i4')
18981953
for k in n5_keywords:
@@ -2326,6 +2381,10 @@ def test_object_arrays_danger(self):
23262381
# skip this one, cannot use delta with objects
23272382
pass
23282383

2384+
def test_structured_array_contain_object(self):
2385+
# skip this one, cannot use delta on structured array
2386+
pass
2387+
23292388

23302389
# custom store, does not support getsize()
23312390
class CustomMapping(object):

zarr/tests/test_meta.py

+69-2
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
import numpy as np
55
import pytest
66

7-
from zarr.codecs import Blosc, Delta, Zlib
7+
from zarr.codecs import Blosc, Delta, Pickle, Zlib
88
from zarr.errors import MetadataError
99
from zarr.meta import (ZARR_FORMAT, decode_array_metadata, decode_dtype,
1010
decode_group_metadata, encode_array_metadata,
11-
encode_dtype)
11+
encode_dtype, encode_fill_value, decode_fill_value)
12+
from zarr.util import normalize_dtype, normalize_fill_value
1213

1314

1415
def assert_json_equal(expect, actual):
@@ -435,3 +436,69 @@ def test_decode_group():
435436
}''' % (ZARR_FORMAT - 1)
436437
with pytest.raises(MetadataError):
437438
decode_group_metadata(b)
439+
440+
441+
@pytest.mark.parametrize(
442+
"fill_value,dtype,object_codec,result",
443+
[
444+
(
445+
(0.0, None),
446+
[('x', float), ('y', object)],
447+
Pickle(),
448+
True, # Pass
449+
),
450+
(
451+
(0.0, None),
452+
[('x', float), ('y', object)],
453+
None,
454+
False, # Fail
455+
),
456+
],
457+
)
458+
def test_encode_fill_value(fill_value, dtype, object_codec, result):
459+
460+
# normalize metadata (copied from _init_array_metadata)
461+
dtype, object_codec = normalize_dtype(dtype, object_codec)
462+
dtype = dtype.base
463+
fill_value = normalize_fill_value(fill_value, dtype)
464+
465+
# test
466+
if result:
467+
encode_fill_value(fill_value, dtype, object_codec)
468+
else:
469+
with pytest.raises(ValueError):
470+
encode_fill_value(fill_value, dtype, object_codec)
471+
472+
473+
@pytest.mark.parametrize(
474+
"fill_value,dtype,object_codec,result",
475+
[
476+
(
477+
(0.0, None),
478+
[('x', float), ('y', object)],
479+
Pickle(),
480+
True, # Pass
481+
),
482+
(
483+
(0.0, None),
484+
[('x', float), ('y', object)],
485+
None,
486+
False, # Fail
487+
),
488+
],
489+
)
490+
def test_decode_fill_value(fill_value, dtype, object_codec, result):
491+
492+
# normalize metadata (copied from _init_array_metadata)
493+
dtype, object_codec = normalize_dtype(dtype, object_codec)
494+
dtype = dtype.base
495+
fill_value = normalize_fill_value(fill_value, dtype)
496+
497+
# test
498+
if result:
499+
v = encode_fill_value(fill_value, dtype, object_codec)
500+
decode_fill_value(v, dtype, object_codec)
501+
else:
502+
with pytest.raises(ValueError):
503+
# No encoding is possible
504+
decode_fill_value(fill_value, dtype, object_codec)

zarr/util.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -253,10 +253,9 @@ def normalize_dimension_separator(sep: Optional[str]) -> Optional[str]:
253253

254254
def normalize_fill_value(fill_value, dtype: np.dtype):
255255

256-
if fill_value is None:
256+
if fill_value is None or dtype.hasobject:
257257
# no fill value
258258
pass
259-
260259
elif fill_value == 0:
261260
# this should be compatible across numpy versions for any array type, including
262261
# structured arrays

0 commit comments

Comments
 (0)