
misc Zarr V3 bug fixes: open_group, open_consolidated and MemoryStoreV3 #1006

Merged · 8 commits · Apr 20, 2022
2 changes: 1 addition & 1 deletion zarr/__init__.py
@@ -11,7 +11,7 @@
 from zarr.hierarchy import Group, group, open_group
 from zarr.n5 import N5Store, N5FSStore
 from zarr.storage import (ABSStore, DBMStore, DictStore, DirectoryStore,
-                          LMDBStore, LRUStoreCache, MemoryStore, MongoDBStore,
+                          KVStore, LMDBStore, LRUStoreCache, MemoryStore, MongoDBStore,
                           NestedDirectoryStore, RedisStore, SQLiteStore,
                           TempStore, ZipStore)
 from zarr.sync import ProcessSynchronizer, ThreadSynchronizer
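With this one-line change, KVStore is re-exported at the package root alongside the other store classes. A minimal sanity check (a sketch; KVStore wraps any mutable mapping):

    import zarr

    # KVStore adapts a plain mutable mapping (here an ordinary dict) to the
    # zarr store interface.
    store = zarr.KVStore(dict())
    root = zarr.group(store=store)
    root.create_group('example')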
4 changes: 3 additions & 1 deletion zarr/convenience.py
@@ -1277,7 +1277,9 @@ def open_consolidated(store: StoreLike, metadata_key=".zmetadata", mode="r+", **
     """

     # normalize parameters
-    store = normalize_store_arg(store, storage_options=kwargs.get("storage_options"), mode=mode)
+    zarr_version = kwargs.get('zarr_version', None)
+    store = normalize_store_arg(store, storage_options=kwargs.get("storage_options"), mode=mode,
+                                zarr_version=zarr_version)
     if mode not in {'r', 'r+'}:
         raise ValueError("invalid mode, expected either 'r' or 'r+'; found {!r}"
                          .format(mode))

Member: Perhaps more for a follow-up, but if this is getting extracted, does it make sense to list it as a parameter?

Member: I notice that's not happening with storage_options either, so it is probably more of a higher-level aesthetic decision.

Contributor Author: I am fine doing it either way.
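For reference, the reviewers' suggestion of listing zarr_version as an explicit parameter would look roughly like the sketch below. This is hypothetical; the merged change keeps it in **kwargs, mirroring how storage_options is handled:

    from zarr.storage import normalize_store_arg

    # Hypothetical alternative signature, not what was merged: zarr_version as
    # a named keyword instead of an entry pulled out of **kwargs.
    def open_consolidated(store, metadata_key='.zmetadata', mode='r+',
                          zarr_version=None, **kwargs):
        store = normalize_store_arg(store,
                                    storage_options=kwargs.get('storage_options'),
                                    mode=mode, zarr_version=zarr_version)
        ...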
3 changes: 2 additions & 1 deletion zarr/hierarchy.py
@@ -1308,7 +1308,8 @@ def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=N
     if chunk_store is not None:
         chunk_store = _normalize_store_arg(chunk_store,
                                            storage_options=storage_options,
-                                           mode=mode)
+                                           mode=mode,
+                                           zarr_version=zarr_version)
     if not getattr(chunk_store, '_store_version', DEFAULT_ZARR_VERSION) == zarr_version:
         raise ValueError(
             "zarr_version of store and chunk_store must match"
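The practical effect of this fix: when open_group receives both a store and a chunk_store, the chunk_store is now normalized to the same store version, so the mismatch check above no longer fires spuriously for v3 inputs. A minimal usage sketch (assuming MemoryStoreV3 is importable from zarr.storage, as the storage.py diff below suggests):

    import zarr
    from zarr.storage import MemoryStoreV3  # assumed import path for this era

    store = MemoryStoreV3()
    chunk_store = MemoryStoreV3()

    # Previously the chunk_store could be normalized without zarr_version and
    # come back as a v2 store, tripping the ValueError above.
    g = zarr.open_group(store, mode='a', path='dataset',
                        chunk_store=chunk_store, zarr_version=3)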
15 changes: 14 additions & 1 deletion zarr/storage.py
@@ -3045,7 +3045,20 @@ def rename(self, src_path: Path, dst_path: Path):
             src_parent, src_key = self._get_parent(base + src_path)
             dst_parent, dst_key = self._require_parent(base + dst_path)

-            dst_parent[dst_key] = src_parent.pop(src_key)
+            if src_key in src_parent:
+                dst_parent[dst_key] = src_parent.pop(src_key)
+
+            if base == meta_root:
+                # check for and move corresponding metadata
+                sfx = _get_metadata_suffix(self)
+                src_meta = src_key + '.array' + sfx
+                if src_meta in src_parent:
+                    dst_meta = dst_key + '.array' + sfx
+                    dst_parent[dst_meta] = src_parent.pop(src_meta)
+                src_meta = src_key + '.group' + sfx
+                if src_meta in src_parent:
+                    dst_meta = dst_key + '.group' + sfx
+                    dst_parent[dst_meta] = src_parent.pop(src_meta)
             any_renamed = True
         any_renamed = _rename_metadata_v3(self, src_path, dst_path) or any_renamed
         if not any_renamed:
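In the v3 layout a node's own metadata document is a sibling key (for example meta/root/foo.group.json) rather than a child of the meta/root/foo/ prefix, so rename has to move it explicitly. A rough illustration of the fixed behavior (a sketch, assuming MemoryStoreV3 and the default '.json' metadata suffix):

    from zarr.storage import MemoryStoreV3  # assumed import path

    store = MemoryStoreV3()
    store['meta/root/old.group.json'] = b'{}'      # metadata for group 'old'
    store['meta/root/old/arr.array.json'] = b'{}'  # metadata for a child array

    store.rename('old', 'new')

    # The sibling *.group.json document now moves along with the prefix contents.
    assert 'meta/root/new.group.json' in store
    assert 'meta/root/new/arr.array.json' in store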
183 changes: 78 additions & 105 deletions zarr/tests/test_convenience.py
@@ -202,23 +202,30 @@ def test_tree(zarr_version):
     assert str(zarr.tree(g1)) == str(g1.tree())


-# TODO: consolidated metadata currently only supported for v2

 @pytest.mark.parametrize('zarr_version', [2, 3])
 @pytest.mark.parametrize('with_chunk_store', [False, True], ids=['default', 'with_chunk_store'])
-def test_consolidate_metadata(with_chunk_store, zarr_version):
-
-    if zarr_version == 2:
-        MemoryStoreClass = MemoryStore
-        path = ''
-    else:
-        MemoryStoreClass = MemoryStoreV3
-        path = 'dataset'
-
+@pytest.mark.parametrize('stores_from_path', [False, True])
+def test_consolidate_metadata(with_chunk_store, zarr_version, stores_from_path):
Member: Outwith this PR, but this reminds me to ask whether v3 would make standardizing consolidated metadata easier. See zarr-developers/zarr-specs#113.

Contributor Author: Also zarr-developers/zarr-specs#136 is related, regarding where this metadata would be stored.

     # setup initial data
-    store = MemoryStoreClass()
-    chunk_store = MemoryStoreClass() if with_chunk_store else None
-    z = group(store, chunk_store=chunk_store, path=path)
+    if stores_from_path:
+        store = tempfile.mkdtemp()
+        atexit.register(atexit_rmtree, store)
+        if with_chunk_store:
+            chunk_store = tempfile.mkdtemp()
+            atexit.register(atexit_rmtree, chunk_store)
+        else:
+            chunk_store = None
+        version_kwarg = {'zarr_version': zarr_version}
+    else:
+        if zarr_version == 2:
+            store = MemoryStore()
+            chunk_store = MemoryStore() if with_chunk_store else None
+        elif zarr_version == 3:
+            store = MemoryStoreV3()
+            chunk_store = MemoryStoreV3() if with_chunk_store else None
+        version_kwarg = {}
+    path = 'dataset' if zarr_version == 3 else None
+    z = group(store, chunk_store=chunk_store, path=path, **version_kwarg)
     z.create_group('g1')
     g2 = z.create_group('g2')
     g2.attrs['hello'] = 'world'
@@ -229,64 +236,81 @@ def test_consolidate_metadata(with_chunk_store, zarr_version):
     arr[:] = 1.0
     assert 16 == arr.nchunks_initialized

+    if stores_from_path:
Member: Seems fine to have in the test, but this is an interesting block, @grlee77. Can you explain it?

Contributor Author: There are tests below that do del store[key], but when the store is just a path string instead of a BaseStore class this will not work, so the test uses z._store directly. I had originally tried deleting the keys directly from the consolidated store, z, instead of accessing the private z._store, but that was not possible because the ConsolidatedMetadataStore classes are read-only.
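To make the distinction concrete (a hedged sketch, not part of the PR): a filesystem path is just a str, so mapping operations such as del only become available once the path has been normalized into a store class:

    import zarr
    from zarr.storage import DirectoryStore

    path = '/tmp/example.zarr'   # hypothetical location
    z = zarr.group(store=path)   # the string is normalized to a store internally

    # `del path['.zgroup']` is meaningless on a str; deletion needs a real
    # store object, which is why the test reaches for z._store when
    # stores_from_path is True.
    store = DirectoryStore(path)
    del store['.zgroup']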

+        # get the actual store class for use with consolidate_metadata
+        store_class = z._store
+    else:
+        store_class = store

     if zarr_version == 3:
         # error on v3 if path not provided
         with pytest.raises(ValueError):
-            consolidate_metadata(store, path=None)
+            consolidate_metadata(store_class, path=None)

         with pytest.raises(ValueError):
-            consolidate_metadata(store, path='')
+            consolidate_metadata(store_class, path='')

     # perform consolidation
-    out = consolidate_metadata(store, path=path)
+    out = consolidate_metadata(store_class, path=path)
     assert isinstance(out, Group)
     assert ['g1', 'g2'] == list(out)
-    if zarr_version == 2:
-        assert isinstance(out._store, ConsolidatedMetadataStore)
-        assert '.zmetadata' in store
-        meta_keys = ['.zgroup',
-                     'g1/.zgroup',
-                     'g2/.zgroup',
-                     'g2/.zattrs',
-                     'g2/arr/.zarray',
-                     'g2/arr/.zattrs']
-    else:
-        assert isinstance(out._store, ConsolidatedMetadataStoreV3)
-        assert 'meta/root/consolidated/.zmetadata' in store
-        meta_keys = ['zarr.json',
-                     meta_root + 'dataset.group.json',
-                     meta_root + 'dataset/g1.group.json',
-                     meta_root + 'dataset/g2.group.json',
-                     meta_root + 'dataset/g2/arr.array.json',
-                     'meta/root/consolidated.group.json']
-    for key in meta_keys:
-        del store[key]
+    if not stores_from_path:
+        if zarr_version == 2:
+            assert isinstance(out._store, ConsolidatedMetadataStore)
+            assert '.zmetadata' in store
+            meta_keys = ['.zgroup',
+                         'g1/.zgroup',
+                         'g2/.zgroup',
+                         'g2/.zattrs',
+                         'g2/arr/.zarray',
+                         'g2/arr/.zattrs']
+        else:
+            assert isinstance(out._store, ConsolidatedMetadataStoreV3)
+            assert 'meta/root/consolidated/.zmetadata' in store
Member: Again outwith this PR, but back to zarr-developers/zarr-specs#113: could the v3 spec here simply replace all the individual JSON files with the consolidated metadata?

Contributor Author: I think it could be done, but I haven't tried it. All store types are already somewhat consolidated for v3, since the entire meta folder tree is independent of the data one. I suppose that if there are many arrays/groups it may still be preferable to consolidate further into one file, though? I would defer to those using the feature on what the desired behavior is.

+            meta_keys = ['zarr.json',
+                         meta_root + 'dataset.group.json',
+                         meta_root + 'dataset/g1.group.json',
+                         meta_root + 'dataset/g2.group.json',
+                         meta_root + 'dataset/g2/arr.array.json',
+                         'meta/root/consolidated.group.json']
+        for key in meta_keys:
+            del store[key]

     # open consolidated
-    z2 = open_consolidated(store, chunk_store=chunk_store, path=path)
+    z2 = open_consolidated(store, chunk_store=chunk_store, path=path, **version_kwarg)
     assert ['g1', 'g2'] == list(z2)
     assert 'world' == z2.g2.attrs['hello']
     assert 1 == z2.g2.arr.attrs['data']
     assert (z2.g2.arr[:] == 1.0).all()
     assert 16 == z2.g2.arr.nchunks
     assert 16 == z2.g2.arr.nchunks_initialized
-    # tests del/write on the store
-    if zarr_version == 2:
-        cmd = ConsolidatedMetadataStore(store)
-        with pytest.raises(PermissionError):
-            del cmd['.zgroup']
-        with pytest.raises(PermissionError):
-            cmd['.zgroup'] = None
+    if stores_from_path:
+        # path string is not a BaseStore subclass so cannot be used to
+        # initialize a ConsolidatedMetadataStore.
+        if zarr_version == 2:
+            with pytest.raises(ValueError):
+                cmd = ConsolidatedMetadataStore(store)
+        elif zarr_version == 3:
+            with pytest.raises(ValueError):
+                cmd = ConsolidatedMetadataStoreV3(store)
     else:
-        cmd = ConsolidatedMetadataStoreV3(store)
-        with pytest.raises(PermissionError):
-            del cmd[meta_root + 'dataset.group.json']
-        with pytest.raises(PermissionError):
-            cmd[meta_root + 'dataset.group.json'] = None
+        # tests del/write on the store
+        if zarr_version == 2:
+            cmd = ConsolidatedMetadataStore(store)
+            with pytest.raises(PermissionError):
+                del cmd['.zgroup']
+            with pytest.raises(PermissionError):
+                cmd['.zgroup'] = None
+        else:
+            cmd = ConsolidatedMetadataStoreV3(store)
+            with pytest.raises(PermissionError):
+                del cmd[meta_root + 'dataset.group.json']
+            with pytest.raises(PermissionError):
+                cmd[meta_root + 'dataset.group.json'] = None

-    # test getsize on the store
-    assert isinstance(getsize(cmd), Integral)
+        # test getsize on the store
+        assert isinstance(getsize(cmd), Integral)

     # test new metadata are not writeable
     with pytest.raises(PermissionError):
@@ -316,62 +340,11 @@ def test_consolidate_metadata(with_chunk_store, zarr_version):

     # make sure keyword arguments are passed through without error
     open_consolidated(
-        store, chunk_store=chunk_store, path=path, cache_attrs=True, synchronizer=None
+        store, chunk_store=chunk_store, path=path, cache_attrs=True, synchronizer=None,
+        **version_kwarg,
     )
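Taken together, the v3 path exercised by this test boils down to the following sketch (hedged; for v3 a non-empty path is required, and open_consolidated only accepts mode 'r' or 'r+'):

    import zarr
    from zarr.convenience import consolidate_metadata, open_consolidated
    from zarr.storage import MemoryStoreV3  # assumed import path

    store = MemoryStoreV3()
    z = zarr.group(store, path='dataset')
    z.create_group('g1')

    consolidate_metadata(store, path='dataset')

    # zarr_version=3 is forwarded through **kwargs to normalize_store_arg,
    # which is the bug this PR fixes in open_consolidated.
    z2 = open_consolidated(store, path='dataset', mode='r+', zarr_version=3)
    assert list(z2) == ['g1']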


-def test_consolidated_with_chunk_store():
-    # setup initial data
-    store = MemoryStore()
-    chunk_store = MemoryStore()
-    z = group(store, chunk_store=chunk_store)
-    z.create_group('g1')
-    g2 = z.create_group('g2')
-    g2.attrs['hello'] = 'world'
-    arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
-    assert 16 == arr.nchunks
-    assert 0 == arr.nchunks_initialized
-    arr.attrs['data'] = 1
-    arr[:] = 1.0
-    assert 16 == arr.nchunks_initialized
-
-    # perform consolidation
-    out = consolidate_metadata(store)
-    assert isinstance(out, Group)
-    assert '.zmetadata' in store
-    for key in ['.zgroup',
-                'g1/.zgroup',
-                'g2/.zgroup',
-                'g2/.zattrs',
-                'g2/arr/.zarray',
-                'g2/arr/.zattrs']:
-        del store[key]
-    # open consolidated
-    z2 = open_consolidated(store, chunk_store=chunk_store)
-    assert ['g1', 'g2'] == list(z2)
-    assert 'world' == z2.g2.attrs['hello']
-    assert 1 == z2.g2.arr.attrs['data']
-    assert (z2.g2.arr[:] == 1.0).all()
-    assert 16 == z2.g2.arr.nchunks
-    assert 16 == z2.g2.arr.nchunks_initialized
-
-    # test the data are writeable
-    z2.g2.arr[:] = 2
-    assert (z2.g2.arr[:] == 2).all()
-
-    # test invalid modes
-    with pytest.raises(ValueError):
-        open_consolidated(store, mode='a', chunk_store=chunk_store)
-    with pytest.raises(ValueError):
-        open_consolidated(store, mode='w', chunk_store=chunk_store)
-    with pytest.raises(ValueError):
-        open_consolidated(store, mode='w-', chunk_store=chunk_store)
-
-    # make sure keyword arguments are passed through without error
-    open_consolidated(store, cache_attrs=True, synchronizer=None,
-                      chunk_store=chunk_store)


@pytest.mark.parametrize("options", (
{"dimension_separator": "/"},
{"dimension_separator": "."},
6 changes: 3 additions & 3 deletions zarr/tests/test_core.py
@@ -2720,8 +2720,8 @@ def test_array_init(self):
         assert isinstance(a, Array)
         assert (100,) == a.shape
         assert (10,) == a.chunks
-        assert path == a.path  # TODO: should this include meta/root?
-        assert '/' + path == a.name  # TODO: should this include meta/root?
+        assert path == a.path
+        assert '/' + path == a.name
         assert 'bar' == a.basename
         assert store is a.store
         assert "968dccbbfc0139f703ead2fd1d503ad6e44db307" == a.hexdigest()
@@ -2772,7 +2772,7 @@ def test_nbytes_stored(self):
         z[:] = 42
         expect_nbytes_stored = sum(buffer_size(v) for k, v in z.store.items() if k != 'zarr.json')
         assert expect_nbytes_stored == z.nbytes_stored
-        assert z.nchunks_initialized == 10  # TODO: added temporarily for testing, can remove
+        assert z.nchunks_initialized == 10

         # mess with store
         if not isinstance(z.store, (LRUStoreCacheV3, FSStoreV3)):