Skip to content

Fix DirectoryStore #773

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Aug 20, 2021
22 changes: 22 additions & 0 deletions fixture/flat/.zarray
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"chunks": [
2,
2
],
"compressor": {
"blocksize": 0,
"clevel": 5,
"cname": "lz4",
"id": "blosc",
"shuffle": 1
},
"dtype": "<i8",
"fill_value": 0,
"filters": null,
"order": "C",
"shape": [
2,
2
],
"zarr_format": 2
}
Binary file added fixture/flat/0.0
Binary file not shown.
23 changes: 23 additions & 0 deletions fixture/nested/.zarray
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"chunks": [
2,
2
],
"compressor": {
"blocksize": 0,
"clevel": 5,
"cname": "lz4",
"id": "blosc",
"shuffle": 1
},
"dimension_separator": "/",
"dtype": "<i8",
"fill_value": 0,
"filters": null,
"order": "C",
"shape": [
2,
2
],
"zarr_format": 2
}
Binary file added fixture/nested/0/0
Binary file not shown.
2 changes: 1 addition & 1 deletion zarr/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1952,7 +1952,7 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None):
return self._encode_chunk(chunk)

def _chunk_key(self, chunk_coords):
return self._key_prefix + '.'.join(map(str, chunk_coords))
return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain how this change is compatible with FSStore._normalize_key? https://github.com/zarr-developers/zarr-python/blob/master/zarr/storage.py#L1076

FSStore._normalize_key assumes that all chunk keys are formatted foo/bar/0.0.0 -- this assumption is the basis of splitting the chunk key into a prefix and a chunk ID via key.split('/'). As I understand it, this change breaks this assumption.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reading the flat and nested fixtures from this repo (zarr.open(f"file:///tmp/{x}")[:]) with some sloppy debugging in place shows:

|               | Array._chunk_key | FSStore._normalize_keys |
|---------------|------------------|-------------------------|
| master:nested |  (0, 0) --> 0.0  |       0.0 --> 0.0       |
| master:flat   |  (0, 0) --> 0.0  |       0.0 --> 0.0       |
| PR:nested     |  (0, 0) --> 0/0  |       0/0 --> 0/0       |
| PR:flat       |  (0, 0) --> 0.0  |       0.0 --> 0.0       |

which likely points to some logic in FSStore being ripe for removal since the Store is basically just accepting what what the Array has detected. Now, how it is that that's working with your PR, I still haven't figured out.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, as your test shows this is fine for FSStore (and maybe we don't need this code in the store at all if the chunk keys come pre-normalized). But this situation is dire for N5Stores, which need to be able to re-order the chunk keys before writing to storage.


def _decode_chunk(self, cdata, start=None, nitems=None, expected_shape=None):
# decompress
Expand Down
2 changes: 1 addition & 1 deletion zarr/n5.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def invert_chunk_coords(key):
last_segment = segments[-1]
if _prog_ckey.match(last_segment):
coords = list(last_segment.split('.'))
last_segment = '.'.join(coords[::-1])
last_segment = '/'.join(coords[::-1])
segments = segments[:-1] + [last_segment]
key = '/'.join(segments)
return key
Expand Down
62 changes: 25 additions & 37 deletions zarr/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,12 +948,37 @@ def dir_path(self, path=None):
return dir_path

def listdir(self, path=None):
return self._dimension_separator == "/" and \
self._nested_listdir(path) or self._flat_listdir(path)

def _flat_listdir(self, path=None):
dir_path = self.dir_path(path)
if os.path.isdir(dir_path):
return sorted(os.listdir(dir_path))
else:
return []

def _nested_listdir(self, path=None):
children = self._flat_listdir(path=path)
if array_meta_key in children:
# special handling of directories containing an array to map nested chunk
# keys back to standard chunk keys
new_children = []
root_path = self.dir_path(path)
for entry in children:
entry_path = os.path.join(root_path, entry)
if _prog_number.match(entry) and os.path.isdir(entry_path):
for dir_path, _, file_names in os.walk(entry_path):
for file_name in file_names:
file_path = os.path.join(dir_path, file_name)
rel_path = file_path.split(root_path + os.path.sep)[1]
new_children.append(rel_path.replace(os.path.sep, '.'))
else:
new_children.append(entry)
return sorted(new_children)
else:
return children

def rename(self, src_path, dst_path):
store_src_path = normalize_storage_path(src_path)
store_dst_path = normalize_storage_path(dst_path)
Expand Down Expand Up @@ -1315,49 +1340,12 @@ def __init__(self, path, normalize_keys=False, dimension_separator="/"):
"NestedDirectoryStore only supports '/' as dimension_separator")
self._dimension_separator = dimension_separator

def __getitem__(self, key):
key = _nested_map_ckey(key)
return super().__getitem__(key)

def __setitem__(self, key, value):
key = _nested_map_ckey(key)
super().__setitem__(key, value)

def __delitem__(self, key):
key = _nested_map_ckey(key)
super().__delitem__(key)

def __contains__(self, key):
key = _nested_map_ckey(key)
return super().__contains__(key)

def __eq__(self, other):
return (
isinstance(other, NestedDirectoryStore) and
self.path == other.path
)

def listdir(self, path=None):
children = super().listdir(path=path)
if array_meta_key in children:
# special handling of directories containing an array to map nested chunk
# keys back to standard chunk keys
new_children = []
root_path = self.dir_path(path)
for entry in children:
entry_path = os.path.join(root_path, entry)
if _prog_number.match(entry) and os.path.isdir(entry_path):
for dir_path, _, file_names in os.walk(entry_path):
for file_name in file_names:
file_path = os.path.join(dir_path, file_name)
rel_path = file_path.split(root_path + os.path.sep)[1]
new_children.append(rel_path.replace(os.path.sep, '.'))
else:
new_children.append(entry)
return sorted(new_children)
else:
return children


# noinspection PyPep8Naming
class ZipStore(MutableMapping):
Expand Down
75 changes: 75 additions & 0 deletions zarr/tests/test_dim_separator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import pytest
from numpy.testing import assert_array_equal

import zarr
from zarr.core import Array
from zarr.storage import (DirectoryStore, NestedDirectoryStore, FSStore)
from zarr.tests.util import have_fsspec


@pytest.fixture(params=("static_nested",
"static_flat",
"directory_nested",
"directory_flat",
"directory_default",
"nesteddirectory_nested",
"nesteddirectory_default",
"fs_nested",
"fs_flat",
"fs_default"))
def dataset(tmpdir, request):
"""
Generate a variety of different Zarrs using
different store implementations as well as
different dimension_separator arguments.
"""

loc = tmpdir.join("dim_sep_test.zarr")
which = request.param
kwargs = {}

if which.startswith("static"):
if which.endswith("nested"):
return "fixture/nested"
else:
return "fixture/flat"

if which.startswith("directory"):
store_class = DirectoryStore
elif which.startswith("nested"):
store_class = NestedDirectoryStore
else:
if have_fsspec is False:
pytest.skip("no fsspec")
store_class = FSStore
kwargs["mode"] = "w"
kwargs["auto_mkdir"] = True

if which.endswith("nested"):
kwargs["dimension_separator"] = "/"
elif which.endswith("flat"):
kwargs["dimension_separator"] = "."

store = store_class(str(loc), **kwargs)
zarr.creation.array(store=store, data=[[1, 2], [3, 4]])
return str(loc)


def verify(array):
assert_array_equal(array[:], [[1, 2], [3, 4]])


def test_open(dataset):
verify(zarr.open(dataset))


def test_fsstore(dataset):
verify(Array(store=FSStore(dataset)))


def test_directory(dataset):
verify(zarr.Array(store=DirectoryStore(dataset)))


def test_nested(dataset):
verify(Array(store=NestedDirectoryStore(dataset)))
18 changes: 11 additions & 7 deletions zarr/tests/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,12 +804,16 @@ def test_pickle(self):

class TestDirectoryStore(StoreTests):

def create_store(self, normalize_keys=False, **kwargs):
skip_if_nested_chunks(**kwargs)

def create_store(self,
normalize_keys=False,
dimension_separator=".",
**kwargs):
path = tempfile.mkdtemp()
atexit.register(atexit_rmtree, path)
store = DirectoryStore(path, normalize_keys=normalize_keys, **kwargs)
store = DirectoryStore(path,
normalize_keys=normalize_keys,
dimension_separator=dimension_separator,
**kwargs)
return store

def test_filesystem_path(self):
Expand Down Expand Up @@ -1150,10 +1154,10 @@ def test_chunk_nesting(self):
# any path where last segment looks like a chunk key gets special handling
store['0.0'] = b'xxx'
assert b'xxx' == store['0.0']
assert b'xxx' == store['0/0']
# assert b'xxx' == store['0/0']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did we loose there? Doesn't the store normalise "." -> "/" anyway?

Copy link
Member Author

@joshmoore joshmoore Jun 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to describe in the commit message on e183566, at least to the best of my understanding:

Previous tests (now commented out) used logic in the store classes to convert "0/0" keys into "0.0" keys, forcing the store to be aware of array details. This tries to swap the logic so that stores are responsible for passing dimension separator values down to the arrays only. Since arrays can also get the dimension_separator value from a .zarray file they are now in charge.

store['foo/10.20.30'] = b'yyy'
assert b'yyy' == store['foo/10.20.30']
assert b'yyy' == store['foo/10/20/30']
# assert b'yyy' == store['foo/10/20/30']
store['42'] = b'zzz'
assert b'zzz' == store['42']

Expand Down Expand Up @@ -1198,7 +1202,7 @@ def test_chunk_nesting(self):
store['0.0'] = b'xxx'
assert '0.0' in store
assert b'xxx' == store['0.0']
assert b'xxx' == store['0/0']
# assert b'xxx' == store['0/0']
store['foo/10.20.30'] = b'yyy'
assert 'foo/10.20.30' in store
assert b'yyy' == store['foo/10.20.30']
Expand Down