diff --git a/zarr/__init__.py b/zarr/__init__.py index 8079bab071..7558ce77de 100644 --- a/zarr/__init__.py +++ b/zarr/__init__.py @@ -9,7 +9,7 @@ zeros_like) from zarr.errors import CopyError, MetadataError from zarr.hierarchy import Group, group, open_group -from zarr.n5 import N5Store +from zarr.n5 import N5Store, N5FSStore from zarr.storage import (ABSStore, DBMStore, DictStore, DirectoryStore, LMDBStore, LRUStoreCache, MemoryStore, MongoDBStore, NestedDirectoryStore, RedisStore, SQLiteStore, diff --git a/zarr/n5.py b/zarr/n5.py index 45e2cdda95..797558fa2d 100644 --- a/zarr/n5.py +++ b/zarr/n5.py @@ -11,7 +11,8 @@ from numcodecs.registry import get_codec, register_codec from .meta import ZARR_FORMAT, json_dumps, json_loads -from .storage import NestedDirectoryStore, _prog_ckey, _prog_number +from .storage import FSStore +from .storage import NestedDirectoryStore, _prog_ckey, _prog_number, normalize_storage_path from .storage import array_meta_key as zarr_array_meta_key from .storage import attrs_key as zarr_attrs_key from .storage import group_meta_key as zarr_group_meta_key @@ -281,12 +282,298 @@ def _contains_attrs(self, path): return len(attrs) > 0 +class N5FSStore(FSStore): + """Implementation of the N5 format (https://github.com/saalfeldlab/n5) using `fsspec`, + which allows storage on a variety of filesystems. Based on `zarr.N5Store`. + Parameters + ---------- + path : string + Location of directory to use as the root of the storage hierarchy. + normalize_keys : bool, optional + If True, all store keys will be normalized to use lower case characters + (e.g. 'foo' and 'FOO' will be treated as equivalent). This can be + useful to avoid potential discrepancies between case-sensitive and + case-insensitive file systems. Default value is False. + + Examples + -------- + Store a single array:: + + >>> import zarr + >>> store = zarr.N5FSStore('data/array.n5', auto_mkdir=True) + >>> z = zarr.zeros((10, 10), chunks=(5, 5), store=store, overwrite=True) + >>> z[...] 
= 42 + + Store a group:: + + >>> store = zarr.N5FSStore('data/group.n5', auto_mkdir=True) + >>> root = zarr.group(store=store, overwrite=True) + >>> foo = root.create_group('foo') + >>> bar = foo.zeros('bar', shape=(10, 10), chunks=(5, 5)) + >>> bar[...] = 42 + + Notes + ----- + This is an experimental feature. + Safe to write in multiple threads or processes. + + Be advised that the `_dimension_separator` property of this store + (and arrays it creates) is ".", but chunks saved by this store will + in fact be "/" separated, as prescribed by the N5 format. + + This is counter-intuitive (to say the least), but not arbitrary. + Chunks in N5 format are stored with reversed dimension order + relative to Zarr chunks: a chunk of a 3D Zarr array would be stored + on a file system as `/0/1/2`, but in N5 the same chunk would be + stored as `/2/1/0`. Therefore, stores targeting N5 must intercept + chunk keys and flip the order of the dimensions before writing to + storage, and this procedure requires chunk keys with "." separated + dimensions, hence the Zarr arrays targeting N5 have the deceptive + "." dimension separator. + """ + _array_meta_key = 'attributes.json' + _group_meta_key = 'attributes.json' + _attrs_key = 'attributes.json' + + def __init__(self, *args, **kwargs): + if 'dimension_separator' in kwargs: + kwargs.pop('dimension_separator') + warnings.warn('Keyword argument `dimension_separator` will be ignored') + dimension_separator = "." 
+ super().__init__(*args, dimension_separator=dimension_separator, **kwargs) + + def _swap_separator(self, key): + segments = list(key.split('/')) + if segments: + last_segment = segments[-1] + if _prog_ckey.match(last_segment): + coords = list(last_segment.split('.')) + last_segment = '/'.join(coords[::-1]) + segments = segments[:-1] + [last_segment] + key = '/'.join(segments) + return key + + def _normalize_key(self, key): + if is_chunk_key(key): + key = invert_chunk_coords(key) + + key = normalize_storage_path(key).lstrip("/") + if key: + *bits, end = key.split("/") + + if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): + end = end.replace(".", "/") + key = "/".join(bits + [end]) + return key.lower() if self.normalize_keys else key + + def __getitem__(self, key): + if key.endswith(zarr_group_meta_key): + + key = key.replace(zarr_group_meta_key, self._group_meta_key) + value = group_metadata_to_zarr(self._load_n5_attrs(key)) + + return json_dumps(value) + + elif key.endswith(zarr_array_meta_key): + + key = key.replace(zarr_array_meta_key, self._array_meta_key) + value = array_metadata_to_zarr(self._load_n5_attrs(key)) + + return json_dumps(value) + + elif key.endswith(zarr_attrs_key): + + key = key.replace(zarr_attrs_key, self._attrs_key) + value = attrs_to_zarr(self._load_n5_attrs(key)) + + if len(value) == 0: + raise KeyError(key) + else: + return json_dumps(value) + + elif is_chunk_key(key): + key = self._swap_separator(key) + + return super().__getitem__(key) + + def __setitem__(self, key, value): + if key.endswith(zarr_group_meta_key): + + key = key.replace(zarr_group_meta_key, self._group_meta_key) + + n5_attrs = self._load_n5_attrs(key) + n5_attrs.update(**group_metadata_to_n5(json_loads(value))) + + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_array_meta_key): + + key = key.replace(zarr_array_meta_key, self._array_meta_key) + + n5_attrs = self._load_n5_attrs(key) + 
n5_attrs.update(**array_metadata_to_n5(json_loads(value))) + + value = json_dumps(n5_attrs) + + elif key.endswith(zarr_attrs_key): + + key = key.replace(zarr_attrs_key, self._attrs_key) + + n5_attrs = self._load_n5_attrs(key) + zarr_attrs = json_loads(value) + + for k in n5_keywords: + if k in zarr_attrs.keys(): + raise ValueError( + "Can not set attribute %s, this is a reserved N5 keyword" % k + ) + + # replace previous user attributes + for k in list(n5_attrs.keys()): + if k not in n5_keywords: + del n5_attrs[k] + + # add new user attributes + n5_attrs.update(**zarr_attrs) + + value = json_dumps(n5_attrs) + + elif is_chunk_key(key): + key = self._swap_separator(key) + + super().__setitem__(key, value) + + def __delitem__(self, key): + + if key.endswith(zarr_group_meta_key): # pragma: no cover + key = key.replace(zarr_group_meta_key, self._group_meta_key) + elif key.endswith(zarr_array_meta_key): # pragma: no cover + key = key.replace(zarr_array_meta_key, self._array_meta_key) + elif key.endswith(zarr_attrs_key): # pragma: no cover + key = key.replace(zarr_attrs_key, self._attrs_key) + elif is_chunk_key(key): + key = self._swap_separator(key) + + super().__delitem__(key) + + def __contains__(self, key): + if key.endswith(zarr_group_meta_key): + + key = key.replace(zarr_group_meta_key, self._group_meta_key) + if key not in self: + return False + # group if not a dataset (attributes do not contain 'dimensions') + return "dimensions" not in self._load_n5_attrs(key) + + elif key.endswith(zarr_array_meta_key): + + key = key.replace(zarr_array_meta_key, self._array_meta_key) + # array if attributes contain 'dimensions' + return "dimensions" in self._load_n5_attrs(key) + + elif key.endswith(zarr_attrs_key): + + key = key.replace(zarr_attrs_key, self._attrs_key) + return self._contains_attrs(key) + + elif is_chunk_key(key): + key = self._swap_separator(key) + + return super().__contains__(key) + + def __eq__(self, other): + return isinstance(other, N5FSStore) and 
self.path == other.path + + def listdir(self, path=None): + if path is not None: + path = invert_chunk_coords(path) + + # We can't use NestedDirectoryStore's listdir, as it requires + # array_meta_key to be present in array directories, which this store + # doesn't provide. + children = super().listdir(path=path) + if self._is_array(path): + + # replace n5 attribute file with respective zarr attribute files + children.remove(self._array_meta_key) + children.append(zarr_array_meta_key) + if self._contains_attrs(path): + children.append(zarr_attrs_key) + + # special handling of directories containing an array to map + # inverted nested chunk keys back to standard chunk keys + new_children = [] + root_path = self.dir_path(path) + for entry in children: + entry_path = os.path.join(root_path, entry) + if _prog_number.match(entry) and self.fs.isdir(entry_path): + for file_name in self.fs.find(entry_path): + file_path = os.path.join(root_path, file_name) + rel_path = file_path.split(root_path)[1] + new_child = rel_path.lstrip('/').replace('/', ".") + new_children.append(invert_chunk_coords(new_child)) + else: + new_children.append(entry) + return sorted(new_children) + + elif self._is_group(path): + + # replace n5 attribute file with respective zarr attribute files + children.remove(self._group_meta_key) + children.append(zarr_group_meta_key) + if self._contains_attrs(path): # pragma: no cover + children.append(zarr_attrs_key) + return sorted(children) + else: + return children + + def _load_n5_attrs(self, path): + try: + s = super().__getitem__(path) + return json_loads(s) + except KeyError: + return {} + + def _is_group(self, path): + + if path is None: + attrs_key = self._attrs_key + else: + attrs_key = os.path.join(path, self._attrs_key) + + n5_attrs = self._load_n5_attrs(attrs_key) + return len(n5_attrs) > 0 and "dimensions" not in n5_attrs + + def _is_array(self, path): + + if path is None: + attrs_key = self._attrs_key + else: + attrs_key = os.path.join(path, 
self._attrs_key) + + return "dimensions" in self._load_n5_attrs(attrs_key) + + def _contains_attrs(self, path): + + if path is None: + attrs_key = self._attrs_key + else: + if not path.endswith(self._attrs_key): + attrs_key = os.path.join(path, self._attrs_key) + else: # pragma: no cover + attrs_key = path + + attrs = attrs_to_zarr(self._load_n5_attrs(attrs_key)) + return len(attrs) > 0 + + def is_chunk_key(key): + rv = False segments = list(key.split('/')) if segments: last_segment = segments[-1] - return _prog_ckey.match(last_segment) - return False # pragma: no cover + rv = _prog_ckey.match(last_segment) + return rv def invert_chunk_coords(key): @@ -373,6 +660,7 @@ def array_metadata_to_zarr(array_metadata): array_metadata['fill_value'] = 0 # also if None was requested array_metadata['order'] = 'C' array_metadata['filters'] = [] + array_metadata['dimension_separator'] = '.' compressor_config = array_metadata['compressor'] compressor_config = compressor_config_to_zarr(compressor_config) diff --git a/zarr/storage.py b/zarr/storage.py index 6ca6271dbf..395551687f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1065,22 +1065,28 @@ class FSStore(MutableMapping): Separator placed between the dimensions of a chunk. 
storage_options : passed to the fsspec implementation """ + _array_meta_key = array_meta_key + _group_meta_key = group_meta_key + _attrs_key = attrs_key - _META_KEYS = (attrs_key, group_meta_key, array_meta_key) - - def __init__(self, url, normalize_keys=False, key_separator=None, + def __init__(self, url, normalize_keys=True, key_separator=None, mode='w', exceptions=(KeyError, PermissionError, IOError), dimension_separator=None, **storage_options): import fsspec self.normalize_keys = normalize_keys + + protocol, _ = fsspec.core.split_protocol(url) + # set auto_mkdir to True for local file system + if protocol in (None, "file") and not storage_options.get("auto_mkdir"): + storage_options["auto_mkdir"] = True + self.map = fsspec.get_mapper(url, **storage_options) self.fs = self.map.fs # for direct operations self.path = self.fs._strip_protocol(url) self.mode = mode self.exceptions = exceptions - # For backwards compatibility. Guaranteed to be non-None if key_separator is not None: dimension_separator = key_separator @@ -1091,7 +1097,6 @@ def __init__(self, url, normalize_keys=False, key_separator=None, # Pass attributes to array creation self._dimension_separator = dimension_separator - if self.fs.exists(self.path) and not self.fs.isdir(self.path): raise FSPathExistNotDir(url) @@ -1100,7 +1105,7 @@ def _normalize_key(self, key): if key: *bits, end = key.split('/') - if end not in FSStore._META_KEYS: + if end not in (self._array_meta_key, self._group_meta_key, self._attrs_key): end = end.replace('.', self.key_separator) key = '/'.join(bits + [end]) @@ -1178,7 +1183,7 @@ def listdir(self, path=None): if self.key_separator != "/": return children else: - if array_meta_key in children: + if self._array_meta_key in children: # special handling of directories containing an array to map nested chunk # keys back to standard chunk keys new_children = [] diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 9043a32a51..be2feffe8a 100644 --- 
a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -19,7 +19,7 @@ from zarr.core import Array from zarr.meta import json_loads -from zarr.n5 import N5Store, n5_keywords +from zarr.n5 import N5Store, N5FSStore, n5_keywords from zarr.storage import ( ABSStore, DBMStore, @@ -1984,12 +1984,12 @@ def test_compressors(self): def expected(self): return [ - 'c6b83adfad999fbd865057531d749d87cf138f58', - 'a3d6d187536ecc3a9dd6897df55d258e2f52f9c5', - 'ec2e008525ae09616dbc1d2408cbdb42532005c8', - 'b63f031031dcd5248785616edcb2d6fe68203c28', - '0cfc673215a8292a87f3c505e2402ce75243c601', - ] + '4e9cf910000506455f82a70938a272a3fce932e5', + 'f9d4cbf1402901f63dea7acf764d2546e4b6aa38', + '1d8199f5f7b70d61aa0d29cc375212c3df07d50a', + '874880f91aa6736825584509144afe6b06b0c05c', + 'e2258fedc74752196a8c8383db49e27193c995e2', + ] def test_hexdigest(self): found = [] @@ -2018,6 +2018,22 @@ def test_hexdigest(self): assert self.expected() == found +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestArrayWithN5FSStore(TestArrayWithN5Store): + + @staticmethod + def create_array(read_only=False, **kwargs): + path = mkdtemp() + atexit.register(shutil.rmtree, path) + store = N5FSStore(path) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + kwargs.setdefault('compressor', Zlib(1)) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs) + + class TestArrayWithDBMStore(TestArray): @staticmethod diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 4296ee6364..1412ec2099 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -23,7 +23,7 @@ from zarr.meta import (ZARR_FORMAT, decode_array_metadata, decode_group_metadata, encode_array_metadata, encode_group_metadata) -from zarr.n5 import N5Store +from zarr.n5 import N5Store, N5FSStore from zarr.storage import (ABSStore, ConsolidatedMetadataStore, 
DBMStore, DictStore, DirectoryStore, LMDBStore, LRUStoreCache, MemoryStore, MongoDBStore, NestedDirectoryStore, @@ -900,13 +900,20 @@ def mock_walker_no_slash(_path): @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestFSStore(StoreTests): - def create_store(self, normalize_keys=False, dimension_separator="."): - path = tempfile.mkdtemp() - atexit.register(atexit_rmtree, path) + def create_store(self, normalize_keys=False, + dimension_separator=".", + path=None, + **kwargs): + + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStore( path, normalize_keys=normalize_keys, - dimension_separator=dimension_separator) + dimension_separator=dimension_separator, + **kwargs) return store def test_init_array(self): @@ -937,8 +944,9 @@ def test_dimension_separator(self): def test_complex(self): path1 = tempfile.mkdtemp() path2 = tempfile.mkdtemp() - store = FSStore("simplecache::file://" + path1, - simplecache={"same_names": True, "cache_storage": path2}) + store = self.create_store(path="simplecache::file://" + path1, + simplecache={"same_names": True, + "cache_storage": path2}) assert not store assert not os.listdir(path1) assert not os.listdir(path2) @@ -949,6 +957,20 @@ def test_complex(self): assert store["foo"] == b"hello" assert 'foo' in os.listdir(path2) + def test_deep_ndim(self): + import zarr + + store = self.create_store() + foo = zarr.open_group(store=store) + bar = foo.create_group("bar") + baz = bar.create_dataset("baz", + shape=(4, 4, 4), + chunks=(2, 2, 2), + dtype="i8") + baz[:] = 1 + assert set(store.listdir()) == set([".zgroup", "bar"]) + assert foo["bar"]["baz"][(0, 0, 0)] == 1 + def test_not_fsspec(self): import zarr path = tempfile.mkdtemp() @@ -979,10 +1001,10 @@ def test_create(self): def test_read_only(self): path = tempfile.mkdtemp() atexit.register(atexit_rmtree, path) - store = FSStore(path) + store = self.create_store(path=path) store['foo'] = b"bar" - store = FSStore(path, 
mode='r') + store = self.create_store(path=path, mode='r') with pytest.raises(PermissionError): store['foo'] = b"hex" @@ -1000,11 +1022,11 @@ def test_read_only(self): filepath = os.path.join(path, "foo") with pytest.raises(ValueError): - FSStore(filepath, mode='r') + self.create_store(path=filepath, mode='r') def test_eq(self): - store1 = FSStore("anypath") - store2 = FSStore("anypath") + store1 = self.create_store(path="anypath") + store2 = self.create_store(path="anypath") assert store1 == store2 @pytest.mark.usefixtures("s3") @@ -1187,7 +1209,7 @@ def test_value_error(self): class TestN5Store(TestNestedDirectoryStore): def create_store(self, normalize_keys=False): - path = tempfile.mkdtemp(suffix='.n5') + path = tempfile.mkdtemp() atexit.register(atexit_rmtree, path) store = N5Store(path, normalize_keys=normalize_keys) return store @@ -1228,6 +1250,7 @@ def test_init_array(self): assert default_compressor.get_config() == compressor_config # N5Store always has a fill value of 0 assert meta['fill_value'] == 0 + assert meta['dimension_separator'] == '.' def test_init_array_path(self): path = 'foo/bar' @@ -1297,6 +1320,109 @@ def test_filters(self): init_array(store, shape=1000, chunks=100, filters=filters) +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestN5FSStore(TestFSStore): + def create_store(self, normalize_keys=False, path=None, **kwargs): + + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + + store = N5FSStore(path, normalize_keys=normalize_keys, **kwargs) + return store + + def test_equal(self): + store_a = self.create_store() + store_b = N5FSStore(store_a.path) + assert store_a == store_b + + # This is copied wholesale from the N5Store tests. The same test could + # be run by making TestN5FSStore inherit from both TestFSStore and + # TestN5Store, but a direct copy is arguably more explicit. 
+ def test_chunk_nesting(self): + store = self.create_store() + store['0.0'] = b'xxx' + assert '0.0' in store + assert b'xxx' == store['0.0'] + # assert b'xxx' == store['0/0'] + store['foo/10.20.30'] = b'yyy' + assert 'foo/10.20.30' in store + assert b'yyy' == store['foo/10.20.30'] + # N5 reverses axis order + assert b'yyy' == store['foo/30/20/10'] + store['42'] = b'zzz' + assert '42' in store + assert b'zzz' == store['42'] + + def test_init_array(self): + store = self.create_store() + init_array(store, shape=1000, chunks=100) + + # check metadata + assert array_meta_key in store + meta = decode_array_metadata(store[array_meta_key]) + assert ZARR_FORMAT == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunks'] + assert np.dtype(None) == meta['dtype'] + # N5Store wraps the actual compressor + compressor_config = meta['compressor']['compressor_config'] + assert default_compressor.get_config() == compressor_config + # N5Store always has a fill value of 0 + assert meta['fill_value'] == 0 + assert meta['dimension_separator'] == '.' 
+ + def test_init_array_path(self): + path = 'foo/bar' + store = self.create_store() + init_array(store, shape=1000, chunks=100, path=path) + + # check metadata + key = path + '/' + array_meta_key + assert key in store + meta = decode_array_metadata(store[key]) + assert ZARR_FORMAT == meta['zarr_format'] + assert (1000,) == meta['shape'] + assert (100,) == meta['chunks'] + assert np.dtype(None) == meta['dtype'] + # N5Store wraps the actual compressor + compressor_config = meta['compressor']['compressor_config'] + assert default_compressor.get_config() == compressor_config + # N5Store always has a fill value of 0 + assert meta['fill_value'] == 0 + + def test_init_array_compat(self): + store = self.create_store() + init_array(store, shape=1000, chunks=100, compressor='none') + meta = decode_array_metadata(store[array_meta_key]) + # N5Store wraps the actual compressor + compressor_config = meta['compressor']['compressor_config'] + assert compressor_config is None + + def test_init_array_overwrite(self): + self._test_init_array_overwrite('C') + + def test_init_array_overwrite_path(self): + self._test_init_array_overwrite_path('C') + + def test_init_array_overwrite_chunk_store(self): + self._test_init_array_overwrite_chunk_store('C') + + def test_init_group_overwrite(self): + self._test_init_group_overwrite('C') + + def test_init_group_overwrite_path(self): + self._test_init_group_overwrite_path('C') + + def test_init_group_overwrite_chunk_store(self): + self._test_init_group_overwrite_chunk_store('C') + + def test_dimension_separator(self): + + with pytest.warns(UserWarning, match='dimension_separator'): + self.create_store(dimension_separator='/') + + @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestNestedFSStore(TestNestedDirectoryStore):