diff --git a/zarr/_storage/absstore.py b/zarr/_storage/absstore.py
index 98ac6328b1..cc41018f9e 100644
--- a/zarr/_storage/absstore.py
+++ b/zarr/_storage/absstore.py
@@ -3,7 +3,7 @@
 import warnings
 from numcodecs.compat import ensure_bytes
 from zarr.util import normalize_storage_path
-from zarr._storage.store import Store
+from zarr._storage.store import _get_metadata_suffix, data_root, meta_root, Store, StoreV3
 
 __doctest_requires__ = {
     ('ABSStore', 'ABSStore.*'): ['azure.storage.blob'],
@@ -209,3 +209,57 @@ def getsize(self, path=None):
 
     def clear(self):
         self.rmdir()
+
+
+class ABSStoreV3(ABSStore, StoreV3):
+
+    def list(self):
+        return list(self.keys())
+
+    def __eq__(self, other):
+        return (
+            isinstance(other, ABSStoreV3) and
+            self.client == other.client and
+            self.prefix == other.prefix
+        )
+
+    def __setitem__(self, key, value):
+        self._validate_key(key)
+        super().__setitem__(key, value)
+
+    def rmdir(self, path=None):
+
+        if not path:
+            # Currently allowing clear to delete everything as in v2
+
+            # If we disallow an empty path then we will need to modify
+            # TestABSStoreV3 to have the create_store method use a prefix.
+            ABSStore.rmdir(self, '')
+            return
+
+        meta_dir = meta_root + path
+        meta_dir = meta_dir.rstrip('/')
+        ABSStore.rmdir(self, meta_dir)
+
+        # remove data folder
+        data_dir = data_root + path
+        data_dir = data_dir.rstrip('/')
+        ABSStore.rmdir(self, data_dir)
+
+        # remove metadata files
+        sfx = _get_metadata_suffix(self)
+        array_meta_file = meta_dir + '.array' + sfx
+        if array_meta_file in self:
+            del self[array_meta_file]
+        group_meta_file = meta_dir + '.group' + sfx
+        if group_meta_file in self:
+            del self[group_meta_file]
+
+    # TODO: adapt the v2 getsize method to work for v3
+    # For now, calling the generic keys-based _getsize
+    def getsize(self, path=None):
+        from zarr.storage import _getsize  # avoid circular import
+        return _getsize(self, path)
+
+
+ABSStoreV3.__doc__ = ABSStore.__doc__
diff --git a/zarr/_storage/store.py b/zarr/_storage/store.py
index 6f5bf78e28..d1ad930609 100644
--- a/zarr/_storage/store.py
+++ b/zarr/_storage/store.py
@@ -1,7 +1,9 @@
+import abc
 from collections.abc import MutableMapping
-from typing import Any, List, Optional, Union
+from string import ascii_letters, digits
+from typing import Any, List, Mapping, Optional, Union
 
-from zarr.meta import Metadata2
+from zarr.meta import Metadata2, Metadata3
 from zarr.util import normalize_storage_path
 
 # v2 store keys
@@ -9,6 +11,12 @@
 group_meta_key = '.zgroup'
 attrs_key = '.zattrs'
 
+# v3 paths
+meta_root = 'meta/root/'
+data_root = 'data/root/'
+
+DEFAULT_ZARR_VERSION = 2
+
 
 class BaseStore(MutableMapping):
     """Abstract base class for store implementations.
@@ -84,6 +92,10 @@ def _ensure_store(store: Any):
         if store is None:
             return None
         elif isinstance(store, BaseStore):
+            if not store._store_version == 2:
+                raise ValueError(
+                    f"cannot initialize a v2 store with a v{store._store_version} store"
+                )
             return store
         elif isinstance(store, MutableMapping):
            return KVStore(store)
@@ -131,6 +143,161 @@ def rmdir(self, path: str = "") -> None:
             _rmdir_from_keys(self, path)
 
 
+
+class StoreV3(BaseStore):
+    _store_version = 3
+    _metadata_class = Metadata3
+    _valid_key_characters = set(ascii_letters + digits + "/.-_")
+
+    def _valid_key(self, key: str) -> bool:
+        """
+        Verify that a key conforms to the specification.
+
+        A key is any string containing only characters in the range a-z, A-Z,
+        0-9, or in the set /.-_; this method returns True if that is the case
+        and False otherwise.
+        """
+        if not isinstance(key, str) or not key.isascii():
+            return False
+        if set(key) - self._valid_key_characters:
+            return False
+        return True
+
+    def _validate_key(self, key: str):
+        """
+        Verify that a key conforms to the v3 specification.
+
+        A key is any string containing only characters in the range a-z, A-Z,
+        0-9, or in the set /.-_; anything else is rejected with a
+        ValueError.
+
+        In spec v3, keys can only start with the prefix meta/, data/ or be
+        exactly zarr.json and should not end with /. This should not be exposed
+        to the user, and is a store implementation detail, so this method will
+        raise a ValueError in that case.
+        """
+        if not self._valid_key(key):
+            raise ValueError(
+                f"Keys must be ascii strings and may only contain the "
+                f"characters {''.join(sorted(self._valid_key_characters))}"
+            )
+
+        if (
+            not key.startswith("data/")
+            and (not key.startswith("meta/"))
+            and (not key == "zarr.json")
+            # TODO: Possibly allow key == ".zmetadata" too if we write a
+            # consolidated metadata spec corresponding to this?
+        ):
+            raise ValueError("key starts with unexpected value: `{}`".format(key))
+
+        if key.endswith('/'):
+            raise ValueError("keys may not end in /")
+
+    def list_prefix(self, prefix):
+        if prefix.startswith('/'):
+            raise ValueError("prefix must not begin with /")
+        # TODO: force prefix to end with /?
+        return [k for k in self.list() if k.startswith(prefix)]
+
+    def erase(self, key):
+        self.__delitem__(key)
+
+    def erase_prefix(self, prefix):
+        assert prefix.endswith("/")
+
+        if prefix == "/":
+            all_keys = self.list()
+        else:
+            all_keys = self.list_prefix(prefix)
+        for key in all_keys:
+            self.erase(key)
+
+    def list_dir(self, prefix):
+        """
+        TODO: carefully test this with trailing/leading slashes
+        """
+        if prefix:  # allow prefix = "" ?
+            assert prefix.endswith("/")
+
+        all_keys = self.list_prefix(prefix)
+        len_prefix = len(prefix)
+        keys = []
+        prefixes = []
+        for k in all_keys:
+            trail = k[len_prefix:]
+            if "/" not in trail:
+                keys.append(prefix + trail)
+            else:
+                prefixes.append(prefix + trail.split("/", maxsplit=1)[0] + "/")
+        return keys, list(set(prefixes))
+
+    def list(self):
+        return list(self.keys())
+
+    def __contains__(self, key):
+        return key in self.list()
+
+    @abc.abstractmethod
+    def __setitem__(self, key, value):
+        """Set a value."""
+
+    @abc.abstractmethod
+    def __getitem__(self, key):
+        """Get a value."""
+
+    def clear(self):
+        """Remove all items from store."""
+        self.erase_prefix("/")
+
+    def __eq__(self, other):
+        return NotImplemented
+
+    @staticmethod
+    def _ensure_store(store):
+        """
+        We want to make sure internally that zarr stores are always a class
+        with a specific interface derived from ``Store``, which is slightly
+        different from ``MutableMapping``.
+
+        We'll do this conversion in a few places automatically.
+        """
+        from zarr.storage import KVStoreV3  # avoid circular import
+        if store is None:
+            return None
+        elif isinstance(store, StoreV3):
+            return store
+        elif isinstance(store, Store):
+            raise ValueError(
+                f"cannot initialize a v3 store with a v{store._store_version} store"
+            )
+        elif isinstance(store, MutableMapping):
+            return KVStoreV3(store)
+        else:
+            for attr in [
+                "keys",
+                "values",
+                "get",
+                "__setitem__",
+                "__getitem__",
+                "__delitem__",
+                "__contains__",
+            ]:
+                if not hasattr(store, attr):
+                    break
+            else:
+                return KVStoreV3(store)
+
+        raise ValueError(
+            "v3 stores must be subclasses of StoreV3, "
+            "if your store exposes the MutableMapping interface wrap it in "
+            f"zarr.storage.KVStoreV3. Got {store}"
+        )
+
+
+# allow MutableMapping for backwards compatibility
+StoreLike = Union[BaseStore, MutableMapping]
+
+
 def _path_to_prefix(path: Optional[str]) -> str:
     # assume path already normalized
     if path:
@@ -140,17 +307,68 @@ def _path_to_prefix(path: Optional[str]) -> str:
     return prefix
 
 
+def _get_hierarchy_metadata(store: StoreV3) -> Mapping[str, Any]:
+    version = getattr(store, '_store_version', 2)
+    if version < 3:
+        raise ValueError("zarr.json hierarchy metadata not stored for "
+                         f"zarr v{version} stores")
+    if 'zarr.json' not in store:
+        raise ValueError("zarr.json metadata not found in store")
+    return store._metadata_class.decode_hierarchy_metadata(store['zarr.json'])
+
+
+def _get_metadata_suffix(store: StoreV3) -> str:
+    if 'zarr.json' in store:
+        return _get_hierarchy_metadata(store)['metadata_key_suffix']
+    return '.json'
+
+
+def _rename_metadata_v3(store: StoreV3, src_path: str, dst_path: str) -> bool:
+    """Rename the array or group metadata file associated with src_path."""
+    any_renamed = False
+    sfx = _get_metadata_suffix(store)
+    src_path = src_path.rstrip('/')
+    dst_path = dst_path.rstrip('/')
+    _src_array_json = meta_root + src_path + '.array' + sfx
+    if _src_array_json in store:
+        new_key = meta_root + dst_path + '.array' + sfx
+        store[new_key] = store.pop(_src_array_json)
+        any_renamed = True
+    _src_group_json = meta_root + src_path + '.group' + sfx
+    if _src_group_json in store:
+        new_key = meta_root + dst_path + '.group' + sfx
+        store[new_key] = store.pop(_src_group_json)
+        any_renamed = True
+    return any_renamed
+
+
 def _rename_from_keys(store: BaseStore, src_path: str, dst_path: str) -> None:
     # assume path already normalized
     src_prefix = _path_to_prefix(src_path)
     dst_prefix = _path_to_prefix(dst_path)
-    for key in list(store.keys()):
-        if key.startswith(src_prefix):
-            new_key = dst_prefix + key.lstrip(src_prefix)
-            store[new_key] = store.pop(key)
-
-
-def _rmdir_from_keys(store: Union[BaseStore, MutableMapping], path: Optional[str] = None) -> None:
+    version = getattr(store, '_store_version', 2)
+    if version == 2:
+        for key in list(store.keys()):
+            if key.startswith(src_prefix):
+                new_key = dst_prefix + key.lstrip(src_prefix)
+                store[new_key] = store.pop(key)
+    else:
+        any_renamed = False
+        for root_prefix in [meta_root, data_root]:
+            _src_prefix = root_prefix + src_prefix
+            _dst_prefix = root_prefix + dst_prefix
+            for key in store.list_prefix(_src_prefix):  # type: ignore
+                new_key = _dst_prefix + key[len(_src_prefix):]
+                store[new_key] = store.pop(key)
+                any_renamed = True
+        any_meta_renamed = _rename_metadata_v3(store, src_path, dst_path)  # type: ignore
+        any_renamed = any_meta_renamed or any_renamed
+
+        if not any_renamed:
+            raise ValueError(f"no item {src_path} found to rename")
+
+
+def _rmdir_from_keys(store: StoreLike, path: Optional[str] = None) -> None:
     # assume path already normalized
     prefix = _path_to_prefix(path)
     for key in list(store.keys()):
@@ -158,6 +376,27 @@ def _rmdir_from_keys(store: Union[BaseStore, MutableMapping], path: Optional[str
             del store[key]
+
+def _rmdir_from_keys_v3(store: StoreV3, path: str = "") -> None:
+
+    meta_dir = meta_root + path
+    meta_dir = meta_dir.rstrip('/')
+    _rmdir_from_keys(store, meta_dir)
+
+    # remove data folder
+    data_dir = data_root + path
+    data_dir = data_dir.rstrip('/')
+    _rmdir_from_keys(store, data_dir)
+
+    # remove metadata files
+    sfx = _get_metadata_suffix(store)
+    array_meta_file = meta_dir + '.array' + sfx
+    if array_meta_file in store:
+        store.erase(array_meta_file)  # type: ignore
+    group_meta_file = meta_dir + '.group' + sfx
+    if group_meta_file in store:
+        store.erase(group_meta_file)  # type: ignore
+
 
 
 def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str]:
     # assume path already normalized
     prefix = _path_to_prefix(path)
@@ -168,3 +407,40 @@ def _listdir_from_keys(store: BaseStore, path: Optional[str] = None) -> List[str
             child = suffix.split('/')[0]
             children.add(child)
     return sorted(children)
+
+
+def _prefix_to_array_key(store: StoreLike, prefix: str) -> str:
+    if getattr(store, "_store_version", 2) == 3:
+        if prefix:
+            sfx = _get_metadata_suffix(store)  # type: ignore
+            key = meta_root + prefix.rstrip("/") + ".array" + sfx
+        else:
+            raise ValueError("prefix must be supplied to get a v3 array key")
+    else:
+        key = prefix + array_meta_key
+    return key
+
+
+def _prefix_to_group_key(store: StoreLike, prefix: str) -> str:
+    if getattr(store, "_store_version", 2) == 3:
+        if prefix:
+            sfx = _get_metadata_suffix(store)  # type: ignore
+            key = meta_root + prefix.rstrip('/') + ".group" + sfx
+        else:
+            raise ValueError("prefix must be supplied to get a v3 group key")
+    else:
+        key = prefix + group_meta_key
+    return key
+
+
+def _prefix_to_attrs_key(store: StoreLike, prefix: str) -> str:
+    if getattr(store, "_store_version", 2) == 3:
+        # for v3, attributes are stored in the array metadata
+        sfx = _get_metadata_suffix(store)  # type: ignore
+        if prefix:
+            key = meta_root + prefix.rstrip('/') + ".array" + sfx
+        else:
+            raise ValueError("prefix must be supplied to get a v3 array key")
+    else:
+        key = prefix + attrs_key
+    return key
diff --git a/zarr/attrs.py b/zarr/attrs.py
index eff1237db1..39683d45d9 100644
--- a/zarr/attrs.py
+++ b/zarr/attrs.py
@@ -1,6 +1,6 @@
 from collections.abc import MutableMapping
 
-from zarr._storage.store import Store
+from zarr._storage.store import Store, StoreV3
 from zarr.util import json_dumps
 
 
@@ -26,7 +26,10 @@ class Attributes(MutableMapping):
 
     def __init__(self, store, key='.zattrs', read_only=False, cache=True,
                  synchronizer=None):
-        self.store = Store._ensure_store(store)
+
+        self._version = getattr(store, '_store_version', 2)
+        _Store = Store if self._version == 2 else StoreV3
+        self.store = _Store._ensure_store(store)
         self.key = key
         self.read_only = read_only
         self.cache = cache
@@ -38,6 +41,8 @@ def _get_nosync(self):
             data = self.store[self.key]
         except KeyError:
             d = dict()
+            if self._version > 2:
+                d['attributes'] = {}
         else:
             d = self.store._metadata_class.parse_metadata(data)
         return d
@@ -47,6 +52,8 @@ def asdict(self):
         if self.cache and self._cached_asdict is not None:
             return self._cached_asdict
         d = self._get_nosync()
+        if self._version == 3:
+            d = d['attributes']
         if self.cache:
             self._cached_asdict = d
         return d
@@ -54,7 +61,10 @@ def refresh(self):
         """Refresh cached attributes from the store."""
         if self.cache:
-            self._cached_asdict = self._get_nosync()
+            if self._version == 2:
+                self._cached_asdict = self._get_nosync()
+            else:
+                self._cached_asdict = self._get_nosync()['attributes']
 
     def __contains__(self, x):
         return x in self.asdict()
@@ -84,7 +94,10 @@ def _setitem_nosync(self, item, value):
         d = self._get_nosync()
 
         # set key value
-        d[item] = value
+        if self._version == 2:
+            d[item] = value
+        else:
+            d['attributes'][item] = value
 
         # _put modified data
         self._put_nosync(d)
@@ -98,7 +111,10 @@ def _delitem_nosync(self, key):
         d = self._get_nosync()
 
         # delete key value
-        del d[key]
+        if self._version == 2:
+            del d[key]
+        else:
+            del d['attributes'][key]
 
         # _put modified data
         self._put_nosync(d)
@@ -106,12
+122,34 @@ def _delitem_nosync(self, key): def put(self, d): """Overwrite all attributes with the key/value pairs in the provided dictionary `d` in a single operation.""" - self._write_op(self._put_nosync, d) + if self._version == 2: + self._write_op(self._put_nosync, d) + else: + self._write_op(self._put_nosync, dict(attributes=d)) def _put_nosync(self, d): - self.store[self.key] = json_dumps(d) - if self.cache: - self._cached_asdict = d + if self._version == 2: + self.store[self.key] = json_dumps(d) + if self.cache: + self._cached_asdict = d + else: + if self.key in self.store: + # Cannot write the attributes directly to JSON, but have to + # store it within the pre-existing attributes key of the v3 + # metadata. + + # Note: this changes the store.counter result in test_caching_on! + + meta = self.store._metadata_class.parse_metadata(self.store[self.key]) + if 'attributes' in meta and 'filters' in meta['attributes']: + # need to preserve any existing "filters" attribute + d['attributes']['filters'] = meta['attributes']['filters'] + meta['attributes'] = d['attributes'] + else: + meta = d + self.store[self.key] = json_dumps(meta) + if self.cache: + self._cached_asdict = d['attributes'] # noinspection PyMethodOverriding def update(self, *args, **kwargs): @@ -124,7 +162,10 @@ def _update_nosync(self, *args, **kwargs): d = self._get_nosync() # update - d.update(*args, **kwargs) + if self._version == 2: + d.update(*args, **kwargs) + else: + d['attributes'].update(*args, **kwargs) # _put modified data self._put_nosync(d) diff --git a/zarr/convenience.py b/zarr/convenience.py index 0cb20220f3..2cbc9bdf68 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -5,15 +5,18 @@ import re from collections.abc import Mapping, MutableMapping +from zarr._storage.store import data_root, meta_root from zarr.core import Array from zarr.creation import array as _create_array -from zarr.creation import normalize_store_arg, open_array +from zarr.creation import open_array from zarr.errors import CopyError, PathNotFoundError from zarr.hierarchy import Group from zarr.hierarchy import group as _create_group from zarr.hierarchy import open_group from zarr.meta import json_dumps, json_loads -from zarr.storage import contains_array, contains_group, BaseStore +from zarr.storage import (_get_metadata_suffix, contains_array, contains_group, + normalize_store_arg, BaseStore, ConsolidatedMetadataStore, + ConsolidatedMetadataStoreV3) from zarr.util import TreeViewer, buffer_size, normalize_storage_path from typing import Union @@ -21,8 +24,14 @@ StoreLike = Union[BaseStore, MutableMapping, str, None] +def _check_and_update_path(store: BaseStore, path): + if getattr(store, '_store_version', 2) > 2 and not path: + raise ValueError("path must be provided for v3 stores") + return normalize_storage_path(path) + + # noinspection PyShadowingBuiltins -def open(store: StoreLike = None, mode: str = "a", **kwargs): +def open(store: StoreLike = None, mode: str = "a", *, zarr_version=None, path=None, **kwargs): """Convenience function to open a group or array using file-mode-like semantics. Parameters @@ -34,6 +43,12 @@ def open(store: StoreLike = None, mode: str = "a", **kwargs): read/write (must exist); 'a' means read/write (create if doesn't exist); 'w' means create (overwrite if exists); 'w-' means create (fail if exists). + zarr_version : {2, 3, None}, optional + The zarr protocol version to use. The default value of None will attempt + to infer the version from `store` if possible, otherwise it will fall + back to 2. 
+ path : str or None, optional + The path within the store to open. **kwargs Additional parameters are passed through to :func:`zarr.creation.open_array` or :func:`zarr.hierarchy.open_group`. @@ -75,14 +90,16 @@ def open(store: StoreLike = None, mode: str = "a", **kwargs): """ - path = kwargs.get('path') # handle polymorphic store arg # we pass storage options explicitly, since normalize_store_arg might construct # a store if the input is a fsspec-compatible URL _store: BaseStore = normalize_store_arg( - store, storage_options=kwargs.pop("storage_options", {}), mode=mode + store, storage_options=kwargs.pop("storage_options", {}), mode=mode, + zarr_version=zarr_version, ) + # path = _check_and_update_path(_store, path) path = normalize_storage_path(path) + kwargs['path'] = path if mode in {'w', 'w-', 'x'}: if 'shape' in kwargs: @@ -109,7 +126,7 @@ def _might_close(path): return isinstance(path, (str, os.PathLike)) -def save_array(store: StoreLike, arr, **kwargs): +def save_array(store: StoreLike, arr, *, zarr_version=None, path=None, **kwargs): """Convenience function to save a NumPy array to the local file system, following a similar API to the NumPy save() function. @@ -119,6 +136,12 @@ def save_array(store: StoreLike, arr, **kwargs): Store or path to directory in file system or name of zip file. arr : ndarray NumPy array with data to save. + zarr_version : {2, 3, None}, optional + The zarr protocol version to use when saving. The default value of None + will attempt to infer the version from `store` if possible, otherwise + it will fall back to 2. + path : str or None, optional + The path within the store where the array will be saved. kwargs Passed through to :func:`create`, e.g., compressor. @@ -141,16 +164,18 @@ def save_array(store: StoreLike, arr, **kwargs): """ may_need_closing = _might_close(store) - _store: BaseStore = normalize_store_arg(store, mode="w") + _store: BaseStore = normalize_store_arg(store, mode="w", zarr_version=zarr_version) + path = _check_and_update_path(_store, path) try: - _create_array(arr, store=_store, overwrite=True, **kwargs) + _create_array(arr, store=_store, overwrite=True, zarr_version=zarr_version, path=path, + **kwargs) finally: if may_need_closing: # needed to ensure zip file records are written _store.close() -def save_group(store: StoreLike, *args, **kwargs): +def save_group(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): """Convenience function to save several NumPy arrays to the local file system, following a similar API to the NumPy savez()/savez_compressed() functions. @@ -160,6 +185,12 @@ def save_group(store: StoreLike, *args, **kwargs): Store or path to directory in file system or name of zip file. args : ndarray NumPy arrays with data to save. + zarr_version : {2, 3, None}, optional + The zarr protocol version to use when saving. The default value of None + will attempt to infer the version from `store` if possible, otherwise + it will fall back to 2. + path : str or None, optional + Path within the store where the group will be saved. kwargs NumPy arrays with data to save. 
@@ -212,21 +243,22 @@ def save_group(store: StoreLike, *args, **kwargs): raise ValueError('at least one array must be provided') # handle polymorphic store arg may_need_closing = _might_close(store) - _store: BaseStore = normalize_store_arg(store, mode="w") + _store: BaseStore = normalize_store_arg(store, mode="w", zarr_version=zarr_version) + path = _check_and_update_path(_store, path) try: - grp = _create_group(_store, overwrite=True) + grp = _create_group(_store, path=path, overwrite=True, zarr_version=zarr_version) for i, arr in enumerate(args): k = 'arr_{}'.format(i) - grp.create_dataset(k, data=arr, overwrite=True) + grp.create_dataset(k, data=arr, overwrite=True, zarr_version=zarr_version) for k, arr in kwargs.items(): - grp.create_dataset(k, data=arr, overwrite=True) + grp.create_dataset(k, data=arr, overwrite=True, zarr_version=zarr_version) finally: if may_need_closing: # needed to ensure zip file records are written _store.close() -def save(store: StoreLike, *args, **kwargs): +def save(store: StoreLike, *args, zarr_version=None, path=None, **kwargs): """Convenience function to save an array or group of arrays to the local file system. Parameters @@ -235,6 +267,12 @@ def save(store: StoreLike, *args, **kwargs): Store or path to directory in file system or name of zip file. args : ndarray NumPy arrays with data to save. + zarr_version : {2, 3, None}, optional + The zarr protocol version to use when saving. The default value of None + will attempt to infer the version from `store` if possible, otherwise + it will fall back to 2. + path : str or None, optional + The path within the group where the arrays will be saved. kwargs NumPy arrays with data to save. @@ -301,9 +339,10 @@ def save(store: StoreLike, *args, **kwargs): if len(args) == 0 and len(kwargs) == 0: raise ValueError('at least one array must be provided') if len(args) == 1 and len(kwargs) == 0: - save_array(store, args[0]) + save_array(store, args[0], zarr_version=zarr_version, path=path) else: - save_group(store, *args, **kwargs) + save_group(store, *args, zarr_version=zarr_version, path=path, + **kwargs) class LazyLoader(Mapping): @@ -336,13 +375,19 @@ def __repr__(self): return r -def load(store: StoreLike): +def load(store: StoreLike, zarr_version=None, path=None): """Load data from an array or group into memory. Parameters ---------- store : MutableMapping or string Store or path to directory in file system or name of zip file. + zarr_version : {2, 3, None}, optional + The zarr protocol version to use when loading. The default value of + None will attempt to infer the version from `store` if possible, + otherwise it will fall back to 2. + path : str or None, optional + The path within the store from which to load. Returns ------- @@ -362,11 +407,12 @@ def load(store: StoreLike): """ # handle polymorphic store arg - _store = normalize_store_arg(store) - if contains_array(_store, path=None): - return Array(store=_store, path=None)[...] - elif contains_group(_store, path=None): - grp = Group(store=_store, path=None) + _store = normalize_store_arg(store, zarr_version=zarr_version) + path = _check_and_update_path(_store, path) + if contains_array(_store, path=path): + return Array(store=_store, path=path)[...] 
+ elif contains_group(_store, path=path): + grp = Group(store=_store, path=path) return LazyLoader(grp) @@ -600,6 +646,16 @@ def copy_store(source, dest, source_path='', dest_path='', excludes=None, # setup counting variables n_copied = n_skipped = n_bytes_copied = 0 + source_store_version = getattr(source, '_store_version', 2) + dest_store_version = getattr(dest, '_store_version', 2) + if source_store_version != dest_store_version: + raise ValueError("zarr stores must share the same protocol version") + + if source_store_version > 2: + nchar_root = len(meta_root) + # code below assumes len(meta_root) === len(data_root) + assert len(data_root) == nchar_root + # setup logging with _LogWriter(log) as log: @@ -607,52 +663,63 @@ def copy_store(source, dest, source_path='', dest_path='', excludes=None, for source_key in sorted(source.keys()): # filter to keys under source path - if source_key.startswith(source_path): + if source_store_version == 2: + if not source_key.startswith(source_path): + continue + elif source_store_version == 3: + # skip 'meta/root/' or 'data/root/' at start of source_key + if not source_key[nchar_root:].startswith(source_path): + continue - # process excludes and includes - exclude = False - for prog in excludes: + # process excludes and includes + exclude = False + for prog in excludes: + if prog.search(source_key): + exclude = True + break + if exclude: + for prog in includes: if prog.search(source_key): - exclude = True + exclude = False break - if exclude: - for prog in includes: - if prog.search(source_key): - exclude = False - break - if exclude: - continue + if exclude: + continue - # map key to destination path + # map key to destination path + if source_store_version == 2: key_suffix = source_key[len(source_path):] dest_key = dest_path + key_suffix - - # create a descriptive label for this operation - descr = source_key - if dest_key != source_key: - descr = descr + ' -> ' + dest_key - - # decide what to do - do_copy = True - if if_exists != 'replace': - if dest_key in dest: - if if_exists == 'raise': - raise CopyError('key {!r} exists in destination' - .format(dest_key)) - elif if_exists == 'skip': - do_copy = False - - # take action - if do_copy: - log('copy {}'.format(descr)) - if not dry_run: - data = source[source_key] - n_bytes_copied += buffer_size(data) - dest[dest_key] = data - n_copied += 1 - else: - log('skip {}'.format(descr)) - n_skipped += 1 + elif source_store_version == 3: + # nchar_root is length of 'meta/root/' or 'data/root/' + key_suffix = source_key[nchar_root + len(source_path):] + dest_key = source_key[:nchar_root] + dest_path + key_suffix + + # create a descriptive label for this operation + descr = source_key + if dest_key != source_key: + descr = descr + ' -> ' + dest_key + + # decide what to do + do_copy = True + if if_exists != 'replace': + if dest_key in dest: + if if_exists == 'raise': + raise CopyError('key {!r} exists in destination' + .format(dest_key)) + elif if_exists == 'skip': + do_copy = False + + # take action + if do_copy: + log('copy {}'.format(descr)) + if not dry_run: + data = source[source_key] + n_bytes_copied += buffer_size(data) + dest[dest_key] = data + n_copied += 1 + else: + log('skip {}'.format(descr)) + n_skipped += 1 # log a final message with a summary of what happened _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) @@ -907,7 +974,15 @@ def _copy(log, source, dest, name, root, shallow, without_attrs, if_exists, # copy attributes if not without_attrs: - ds.attrs.update(source.attrs) + if 
dest_h5py and 'filters' in source.attrs: + # No filters key in v3 metadata so it was stored in the + # attributes instead. We cannot copy this key to + # HDF5 attrs, though! + source_attrs = source.attrs.asdict().copy() + source_attrs.pop('filters', None) + else: + source_attrs = source.attrs + ds.attrs.update(source_attrs) n_copied += 1 @@ -1063,6 +1138,8 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, # setup counting variables n_copied = n_skipped = n_bytes_copied = 0 + zarr_version = getattr(source, '_version', 2) + # setup logging with _LogWriter(log) as log: @@ -1074,7 +1151,8 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, n_copied += c n_skipped += s n_bytes_copied += b - dest.attrs.update(**source.attrs) + if zarr_version == 2: + dest.attrs.update(**source.attrs) # log a final message with a summary of what happened _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) @@ -1082,7 +1160,7 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, return n_copied, n_skipped, n_bytes_copied -def consolidate_metadata(store: StoreLike, metadata_key=".zmetadata"): +def consolidate_metadata(store: BaseStore, metadata_key=".zmetadata", *, path=''): """ Consolidate all metadata for groups and arrays within the given store into a single resource and put it under the given key. @@ -1105,6 +1183,9 @@ def consolidate_metadata(store: StoreLike, metadata_key=".zmetadata"): Store or path to directory in file system or name of zip file. metadata_key : str Key to put the consolidated metadata under. + path : str or None + Path corresponding to the group that is being consolidated. Not required + for zarr v2 stores. Returns ------- @@ -1118,9 +1199,29 @@ def consolidate_metadata(store: StoreLike, metadata_key=".zmetadata"): """ store = normalize_store_arg(store, mode="w") - def is_zarr_key(key): - return (key.endswith('.zarray') or key.endswith('.zgroup') or - key.endswith('.zattrs')) + version = store._store_version + + if version == 2: + + def is_zarr_key(key): + return (key.endswith('.zarray') or key.endswith('.zgroup') or + key.endswith('.zattrs')) + + else: + + sfx = _get_metadata_suffix(store) # type: ignore + + def is_zarr_key(key): + return (key.endswith('.array' + sfx) or key.endswith('.group' + sfx) or + key == 'zarr.json') + + # cannot create a group without a path in v3 + # so create /meta/root/consolidated group to store the metadata + if 'consolidated' not in store: + _create_group(store, path='consolidated') + if not metadata_key.startswith('meta/root/'): + metadata_key = 'meta/root/consolidated/' + metadata_key + # path = 'consolidated' out = { 'zarr_consolidated_format': 1, @@ -1130,7 +1231,7 @@ def is_zarr_key(key): } } store[metadata_key] = json_dumps(out) - return open_consolidated(store, metadata_key=metadata_key) + return open_consolidated(store, metadata_key=metadata_key, path=path) def open_consolidated(store: StoreLike, metadata_key=".zmetadata", mode="r+", **kwargs): @@ -1175,17 +1276,28 @@ def open_consolidated(store: StoreLike, metadata_key=".zmetadata", mode="r+", ** """ - from .storage import ConsolidatedMetadataStore - # normalize parameters store = normalize_store_arg(store, storage_options=kwargs.get("storage_options"), mode=mode) if mode not in {'r', 'r+'}: raise ValueError("invalid mode, expected either 'r' or 'r+'; found {!r}" .format(mode)) + path = kwargs.pop('path', None) + if store._store_version == 2: + ConsolidatedStoreClass = ConsolidatedMetadataStore + else: + 
ConsolidatedStoreClass = ConsolidatedMetadataStoreV3 + # default is to store within 'consolidated' group on v3 + if not metadata_key.startswith('meta/root/'): + metadata_key = 'meta/root/consolidated/' + metadata_key + if not path: + raise ValueError( + "path must be provided to open a Zarr 3.x consolidated store" + ) + # setup metadata store - meta_store = ConsolidatedMetadataStore(store, metadata_key=metadata_key) + meta_store = ConsolidatedStoreClass(store, metadata_key=metadata_key) # pass through chunk_store = kwargs.pop('chunk_store', None) or store - return open(store=meta_store, chunk_store=chunk_store, mode=mode, **kwargs) + return open(store=meta_store, chunk_store=chunk_store, mode=mode, path=path, **kwargs) diff --git a/zarr/core.py b/zarr/core.py index e0fe4eb0e9..5e2b4252aa 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -4,13 +4,14 @@ import math import operator import re +from collections.abc import MutableMapping from functools import reduce +from typing import Any import numpy as np from numcodecs.compat import ensure_bytes, ensure_ndarray -from collections.abc import MutableMapping - +from zarr._storage.store import _prefix_to_attrs_key from zarr.attrs import Attributes from zarr.codecs import AsType, get_codec from zarr.errors import ArrayNotFoundError, ReadOnlyError, ArrayIndexError @@ -31,7 +32,13 @@ is_scalar, pop_fields, ) -from zarr.storage import array_meta_key, attrs_key, getsize, listdir, BaseStore +from zarr.storage import ( + _get_hierarchy_metadata, + _prefix_to_array_key, + getsize, + listdir, + normalize_store_arg, +) from zarr.util import ( all_equal, InfoReporter, @@ -146,7 +153,7 @@ class Array: def __init__( self, - store: BaseStore, + store: Any, # BaseStore not stricly required due to normalize_store_arg path=None, read_only=False, chunk_store=None, @@ -155,12 +162,18 @@ def __init__( cache_attrs=True, partial_decompress=False, write_empty_chunks=False, + zarr_version=None, ): # N.B., expect at this point store is fully initialized with all # configuration metadata fully specified and normalized - store = BaseStore._ensure_store(store) - chunk_store = BaseStore._ensure_store(chunk_store) + store = normalize_store_arg(store, zarr_version=zarr_version) + if zarr_version is None: + zarr_version = store._store_version + + if chunk_store is not None: + chunk_store = normalize_store_arg(chunk_store, + zarr_version=zarr_version) self._store = store self._chunk_store = chunk_store @@ -175,12 +188,19 @@ def __init__( self._is_view = False self._partial_decompress = partial_decompress self._write_empty_chunks = write_empty_chunks + self._version = zarr_version + + if self._version == 3: + self._data_key_prefix = 'data/root/' + self._key_prefix + self._data_path = 'data/root/' + self._path + self._hierarchy_metadata = _get_hierarchy_metadata(store=self._store) + self._metadata_key_suffix = self._hierarchy_metadata['metadata_key_suffix'] # initialize metadata self._load_metadata() # initialize attributes - akey = self._key_prefix + attrs_key + akey = _prefix_to_attrs_key(self._store, self._key_prefix) self._attrs = Attributes(store, key=akey, read_only=read_only, synchronizer=synchronizer, cache=cache_attrs) @@ -196,13 +216,13 @@ def _load_metadata(self): if self._synchronizer is None: self._load_metadata_nosync() else: - mkey = self._key_prefix + array_meta_key + mkey = _prefix_to_array_key(self._store, self._key_prefix) with self._synchronizer[mkey]: self._load_metadata_nosync() def _load_metadata_nosync(self): try: - mkey = self._key_prefix + array_meta_key + 
mkey = _prefix_to_array_key(self._store, self._key_prefix) meta_bytes = self._store[mkey] except KeyError: raise ArrayNotFoundError(self._path) @@ -212,32 +232,47 @@ def _load_metadata_nosync(self): meta = self._store._metadata_class.decode_array_metadata(meta_bytes) self._meta = meta self._shape = meta['shape'] - self._chunks = meta['chunks'] - self._dtype = meta['dtype'] self._fill_value = meta['fill_value'] - self._order = meta['order'] - dimension_separator = meta.get('dimension_separator', None) - if dimension_separator is None: - try: - dimension_separator = self._store._dimension_separator - except (AttributeError, KeyError): - pass - - # Fallback for any stores which do not choose a default + if self._version == 2: + self._chunks = meta['chunks'] + self._dtype = meta['dtype'] + self._order = meta['order'] if dimension_separator is None: - dimension_separator = "." + try: + dimension_separator = self._store._dimension_separator + except (AttributeError, KeyError): + pass + + # Fallback for any stores which do not choose a default + if dimension_separator is None: + dimension_separator = "." + else: + self._chunks = meta['chunk_grid']['chunk_shape'] + self._dtype = meta['data_type'] + self._order = meta['chunk_memory_layout'] + chunk_separator = meta['chunk_grid']['separator'] + if dimension_separator is None: + dimension_separator = meta.get('dimension_separator', chunk_separator) + self._dimension_separator = dimension_separator # setup compressor - config = meta['compressor'] - if config is None: + compressor = meta.get('compressor', None) + if compressor is None: self._compressor = None + elif self._version == 2: + self._compressor = get_codec(compressor) else: - self._compressor = get_codec(config) + self._compressor = compressor # setup filters - filters = meta['filters'] + if self._version == 2: + filters = meta.get('filters', []) + else: + # TODO: storing filters under attributes for now since the v3 + # array metadata does not have a 'filters' attribute. 
+ filters = meta['attributes'].get('filters', []) if filters: filters = [get_codec(config) for config in filters] self._filters = filters @@ -262,10 +297,23 @@ def _flush_metadata_nosync(self): filters_config = [f.get_config() for f in self._filters] else: filters_config = None - meta = dict(shape=self._shape, chunks=self._chunks, dtype=self._dtype, - compressor=compressor_config, fill_value=self._fill_value, - order=self._order, filters=filters_config) - mkey = self._key_prefix + array_meta_key + _compressor = compressor_config if self._version == 2 else self._compressor + meta = dict(shape=self._shape, compressor=_compressor, + fill_value=self._fill_value, filters=filters_config) + if getattr(self._store, '_store_version', 2) == 2: + meta.update( + dict(chunks=self._chunks, dtype=self._dtype, order=self._order) + ) + else: + meta.update( + dict(chunk_grid=dict(type='regular', + chunk_shape=self._chunks, + separator=self._dimension_separator), + data_type=self._dtype, + chunk_memory_layout=self._order, + attributes=self.attrs.asdict()) + ) + mkey = _prefix_to_array_key(self._store, self._key_prefix) self._store[mkey] = self._store._metadata_class.encode_array_metadata(meta) @property @@ -453,11 +501,28 @@ def nchunks(self): def nchunks_initialized(self): """The number of chunks that have been initialized with some data.""" - # key pattern for chunk keys - prog = re.compile(r'\.'.join([r'\d+'] * min(1, self.ndim))) - # count chunk keys - return sum(1 for k in listdir(self.chunk_store, self._path) if prog.match(k)) + if self._version == 3: + # # key pattern for chunk keys + # prog = re.compile(r'\.'.join([r'c\d+'] * min(1, self.ndim))) + # # get chunk keys, excluding the prefix + # members = self.chunk_store.list_prefix(self._data_path) + # members = [k.split(self._data_key_prefix)[1] for k in members] + # # count the chunk keys + # return sum(1 for k in members if prog.match(k)) + + # key pattern for chunk keys + prog = re.compile(self._data_key_prefix + r'c\d+') # TODO: ndim == 0 case? + # get chunk keys, excluding the prefix + members = self.chunk_store.list_prefix(self._data_path) + # count the chunk keys + return sum(1 for k in members if prog.match(k)) + else: + # key pattern for chunk keys + prog = re.compile(r'\.'.join([r'\d+'] * min(1, self.ndim))) + + # count chunk keys + return sum(1 for k in listdir(self.chunk_store, self._path) if prog.match(k)) # backwards compatibility initialized = nchunks_initialized @@ -2061,7 +2126,15 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None): return chunk def _chunk_key(self, chunk_coords): - return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) + if self._version == 3: + # _chunk_key() corresponds to data_key(P, i, j, ...) example in the spec + # where P = self._key_prefix, i, j, ... = chunk_coords + # e.g. 
c0/2/3 for 3d array with chunk index (0, 2, 3) + # https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/core/v3.0.html#regular-grids + return ("data/root/" + self._key_prefix + + "c" + self._dimension_separator.join(map(str, chunk_coords))) + else: + return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords)) def _decode_chunk(self, cdata, start=None, nitems=None, expected_shape=None): # decompress @@ -2242,7 +2315,8 @@ def digest(self, hashname="sha1"): for i in itertools.product(*[range(s) for s in self.cdata_shape]): h.update(self.chunk_store.get(self._chunk_key(i), b"")) - h.update(self.store.get(self._key_prefix + array_meta_key, b"")) + mkey = _prefix_to_array_key(self._store, self._key_prefix) + h.update(self.store.get(mkey, b"")) h.update(self.store.get(self.attrs.key, b"")) @@ -2279,7 +2353,7 @@ def hexdigest(self, hashname="sha1"): def __getstate__(self): return (self._store, self._path, self._read_only, self._chunk_store, self._synchronizer, self._cache_metadata, self._attrs.cache, - self._partial_decompress, self._write_empty_chunks) + self._partial_decompress, self._write_empty_chunks, self._version) def __setstate__(self, state): self.__init__(*state) @@ -2292,7 +2366,7 @@ def _synchronized_op(self, f, *args, **kwargs): else: # synchronize on the array - mkey = self._key_prefix + array_meta_key + mkey = _prefix_to_array_key(self._store, self._key_prefix) lock = self._synchronizer[mkey] with lock: @@ -2559,7 +2633,7 @@ def view(self, shape=None, chunks=None, dtype=None, if synchronizer is None: synchronizer = self._synchronizer a = Array(store=store, path=path, chunk_store=chunk_store, read_only=read_only, - synchronizer=synchronizer, cache_metadata=True) + synchronizer=synchronizer, cache_metadata=True, zarr_version=self._version) a._is_view = True # allow override of some properties diff --git a/zarr/creation.py b/zarr/creation.py index 9d6902a6e3..b8c40a859b 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -3,6 +3,7 @@ import numpy as np from numcodecs.registry import codec_registry +from zarr._storage.store import DEFAULT_ZARR_VERSION from zarr.core import Array from zarr.errors import ( ArrayNotFoundError, @@ -19,8 +20,8 @@ def create(shape, chunks=True, dtype=None, compressor='default', fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, path=None, chunk_store=None, filters=None, cache_metadata=True, cache_attrs=True, read_only=False, - object_codec=None, dimension_separator=None, - write_empty_chunks=False, **kwargs): + object_codec=None, dimension_separator=None, write_empty_chunks=True, + *, zarr_version=None, **kwargs): """Create an array. Parameters @@ -80,8 +81,13 @@ def create(shape, chunks=True, dtype=None, compressor='default', deleted. This setting enables sparser storage, as only chunks with non-fill-value data are stored, at the expense of overhead associated with checking the data of each chunk. + .. versionadded:: 2.11 + zarr_version : {None, 2, 3}, optional + The zarr protocol version of the created array. If None, it will be + inferred from ``store`` or ``chunk_store`` if they are provided, + otherwise defaulting to 2. 
Returns ------- @@ -126,9 +132,12 @@ def create(shape, chunks=True, dtype=None, compressor='default', """ + if zarr_version is None and store is None: + zarr_version = getattr(chunk_store, '_store_version', DEFAULT_ZARR_VERSION) # handle polymorphic store arg - store = normalize_store_arg(store) + store = normalize_store_arg(store, zarr_version=zarr_version) + zarr_version = getattr(store, '_store_version', DEFAULT_ZARR_VERSION) # API compatibility with h5py compressor, fill_value = _kwargs_compat(compressor, fill_value, kwargs) @@ -145,6 +154,9 @@ def create(shape, chunks=True, dtype=None, compressor='default', f"{store_separator}") dimension_separator = normalize_dimension_separator(dimension_separator) + if zarr_version > 2 and path is None: + raise ValueError("path must be supplied to initialize a zarr v3 array") + # initialize array metadata init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, overwrite=overwrite, path=path, @@ -392,6 +404,9 @@ def open_array( storage_options=None, partial_decompress=False, write_empty_chunks=False, + *, + zarr_version=None, + dimension_separator=None, **kwargs ): """Open an array using file-mode-like semantics. @@ -454,8 +469,19 @@ def open_array( deleted. This setting enables sparser storage, as only chunks with non-fill-value data are stored, at the expense of overhead associated with checking the data of each chunk. + .. versionadded:: 2.11 + zarr_version : {None, 2, 3}, optional + The zarr protocol version of the array to be opened. If None, it will + be inferred from ``store`` or ``chunk_store`` if they are provided, + otherwise defaulting to 2. + dimension_separator : {None, '.', '/'}, optional + Can be used to specify whether the array is in a flat ('.') or nested + ('/') format. If None, the appropriate value will be read from `store` + when present. Otherwise, defaults to '.' when ``zarr_version == 2`` + and `/` otherwise. + Returns ------- z : zarr.core.Array @@ -489,12 +515,29 @@ def open_array( # w- or x : create, fail if exists # a : read/write if exists, create otherwise (default) + if zarr_version is None and store is None: + zarr_version = getattr(chunk_store, '_store_version', DEFAULT_ZARR_VERSION) + # handle polymorphic store arg - store = normalize_store_arg(store, storage_options=storage_options, mode=mode) + store = normalize_store_arg(store, storage_options=storage_options, + mode=mode, zarr_version=zarr_version) + zarr_version = getattr(store, '_store_version', DEFAULT_ZARR_VERSION) if chunk_store is not None: chunk_store = normalize_store_arg(chunk_store, storage_options=storage_options, - mode=mode) + mode=mode, + zarr_version=zarr_version) + + # respect the dimension separator specified in a store, if present + if dimension_separator is None: + if hasattr(store, '_dimension_separator'): + dimension_separator = store._dimension_separator + else: + dimension_separator = '.' if zarr_version == 2 else '/' + + if zarr_version == 3 and path is None: + path = 'array' # TODO: raise ValueError instead? 
+ path = normalize_storage_path(path) # API compatibility with h5py @@ -516,7 +559,8 @@ def open_array( init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, overwrite=True, path=path, - object_codec=object_codec, chunk_store=chunk_store) + object_codec=object_codec, chunk_store=chunk_store, + dimension_separator=dimension_separator) elif mode == 'a': if not contains_array(store, path=path): @@ -525,7 +569,8 @@ def open_array( init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, path=path, - object_codec=object_codec, chunk_store=chunk_store) + object_codec=object_codec, chunk_store=chunk_store, + dimension_separator=dimension_separator) elif mode in ['w-', 'x']: if contains_group(store, path=path): @@ -536,7 +581,8 @@ def open_array( init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, path=path, - object_codec=object_codec, chunk_store=chunk_store) + object_codec=object_codec, chunk_store=chunk_store, + dimension_separator=dimension_separator) # determine read only status read_only = mode == 'r' @@ -564,6 +610,7 @@ def _like_args(a, kwargs): kwargs.setdefault('compressor', a.compressor) kwargs.setdefault('order', a.order) kwargs.setdefault('filters', a.filters) + kwargs.setdefault('zarr_version', a._version) else: kwargs.setdefault('compressor', 'default') kwargs.setdefault('order', 'C') diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 31c9e2a8d2..0684be4a57 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -3,11 +3,11 @@ import numpy as np +from zarr._storage.store import _get_metadata_suffix, data_root, meta_root, DEFAULT_ZARR_VERSION from zarr.attrs import Attributes from zarr.core import Array from zarr.creation import (array, create, empty, empty_like, full, full_like, - normalize_store_arg, ones, ones_like, zeros, - zeros_like) + ones, ones_like, zeros, zeros_like) from zarr.errors import ( ContainsArrayError, ContainsGroupError, @@ -15,14 +15,18 @@ ReadOnlyError, ) from zarr.storage import ( + _get_hierarchy_metadata, + _prefix_to_group_key, BaseStore, MemoryStore, + MemoryStoreV3, attrs_key, contains_array, contains_group, group_meta_key, init_group, listdir, + normalize_store_arg, rename, rmdir, ) @@ -109,9 +113,12 @@ class Group(MutableMapping): """ def __init__(self, store, path=None, read_only=False, chunk_store=None, - cache_attrs=True, synchronizer=None): - store: BaseStore = BaseStore._ensure_store(store) - chunk_store: BaseStore = BaseStore._ensure_store(chunk_store) + cache_attrs=True, synchronizer=None, zarr_version=None): + store: BaseStore = _normalize_store_arg(store, zarr_version=zarr_version) + if zarr_version is None: + zarr_version = getattr(store, '_store_version', DEFAULT_ZARR_VERSION) + if chunk_store is not None: + chunk_store: BaseStore = _normalize_store_arg(chunk_store, zarr_version=zarr_version) self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) @@ -121,6 +128,13 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, self._key_prefix = '' self._read_only = read_only self._synchronizer = synchronizer + self._version = zarr_version + + if self._version == 3: + self._data_key_prefix = data_root + self._key_prefix + self._data_path = data_root + self._path + self._hierarchy_metadata = _get_hierarchy_metadata(store=self._store) + 
self._metadata_key_suffix = _get_metadata_suffix(store=self._store) # guard conditions if contains_array(store, path=self._path): @@ -128,15 +142,29 @@ def __init__(self, store, path=None, read_only=False, chunk_store=None, # initialize metadata try: - mkey = self._key_prefix + group_meta_key + mkey = _prefix_to_group_key(self._store, self._key_prefix) + assert not mkey.endswith("root/.group") meta_bytes = store[mkey] except KeyError: - raise GroupNotFoundError(path) + if self._version == 2: + raise GroupNotFoundError(path) + else: + implicit_prefix = meta_root + self._key_prefix + if self._store.list_prefix(implicit_prefix): + # implicit group does not have any metadata + self._meta = None + else: + raise GroupNotFoundError(path) else: self._meta = self._store._metadata_class.decode_group_metadata(meta_bytes) # setup attributes - akey = self._key_prefix + attrs_key + if self._version == 2: + akey = self._key_prefix + attrs_key + else: + # Note: mkey doesn't actually exist for implicit groups, but the + # object can still be created. + akey = mkey self._attrs = Attributes(store, key=akey, read_only=read_only, cache=cache_attrs, synchronizer=synchronizer) @@ -227,11 +255,36 @@ def __iter__(self): quux """ - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if (contains_array(self._store, path) or - contains_group(self._store, path)): - yield key + if getattr(self._store, '_store_version', 2) == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if (contains_array(self._store, path) or + contains_group(self._store, path)): + yield key + else: + # TODO: Should this iterate over data folders and/or metadata + # folders and/or metadata files + + dir_path = meta_root + self._key_prefix + name_start = len(dir_path) + keys, prefixes = self._store.list_dir(dir_path) + + # yield any groups or arrays + sfx = self._metadata_key_suffix + for key in keys: + len_suffix = len('.group') + len(sfx) # same for .array + if key.endswith(('.group' + sfx, '.array' + sfx)): + yield key[name_start:-len_suffix] + + # also yield any implicit groups + for prefix in prefixes: + prefix = prefix.rstrip('/') + # only implicit if there is no .group.sfx file + if not prefix + '.group' + sfx in self._store: + yield prefix[name_start:] + + # Note: omit data/root/ to avoid duplicate listings + # any group in data/root/ must has an entry in meta/root/ def __len__(self): """Number of members.""" @@ -325,7 +378,7 @@ def __contains__(self, item): """ path = self._item_path(item) return contains_array(self._store, path) or \ - contains_group(self._store, path) + contains_group(self._store, path, explicit_only=False) def __getitem__(self, item): """Obtain a group member. 
@@ -352,11 +405,21 @@ def __getitem__(self, item): if contains_array(self._store, path): return Array(self._store, read_only=self._read_only, path=path, chunk_store=self._chunk_store, - synchronizer=self._synchronizer, cache_attrs=self.attrs.cache) - elif contains_group(self._store, path): + synchronizer=self._synchronizer, cache_attrs=self.attrs.cache, + zarr_version=self._version) + elif contains_group(self._store, path, explicit_only=True): return Group(self._store, read_only=self._read_only, path=path, chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer) + synchronizer=self._synchronizer, zarr_version=self._version) + elif self._version == 3: + implicit_group = meta_root + path + '/' + # non-empty folder in the metadata path implies an implicit group + if self._store.list_prefix(implicit_group): + return Group(self._store, read_only=self._read_only, path=path, + chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, zarr_version=self._version) + else: + raise KeyError(item) else: raise KeyError(item) @@ -369,7 +432,7 @@ def __delitem__(self, item): def _delitem_nosync(self, item): path = self._item_path(item) if contains_array(self._store, path) or \ - contains_group(self._store, path): + contains_group(self._store, path, explicit_only=False): rmdir(self._store, path) else: raise KeyError(item) @@ -406,10 +469,23 @@ def group_keys(self): ['bar', 'foo'] """ - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_group(self._store, path): - yield key + if self._version == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path): + yield key + else: + dir_name = meta_root + self._path + group_sfx = '.group' + self._metadata_key_suffix + for key in sorted(listdir(self._store, dir_name)): + if key.endswith(group_sfx): + key = key[:-len(group_sfx)] + path = self._key_prefix + key + if path.endswith(".array" + self._metadata_key_suffix): + # skip array keys + continue + if contains_group(self._store, path, explicit_only=False): + yield key def groups(self): """Return an iterator over (name, value) pairs for groups only. 
@@ -428,13 +504,38 @@ def groups(self): foo """ - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_group(self._store, path): - yield key, Group(self._store, path=path, read_only=self._read_only, - chunk_store=self._chunk_store, - cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer) + if self._version == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + if contains_group(self._store, path, explicit_only=False): + yield key, Group( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version) + + else: + dir_name = meta_root + self._path + group_sfx = '.group' + self._metadata_key_suffix + for key in sorted(listdir(self._store, dir_name)): + if key.endswith(group_sfx): + key = key[:-len(group_sfx)] + path = self._key_prefix + key + if path.endswith(".array" + self._metadata_key_suffix): + # skip array keys + continue + if contains_group(self._store, path, explicit_only=False): + yield key, Group( + self._store, + path=path, + read_only=self._read_only, + chunk_store=self._chunk_store, + cache_attrs=self.attrs.cache, + synchronizer=self._synchronizer, + zarr_version=self._version) def array_keys(self, recurse=False): """Return an iterator over member names for arrays only. @@ -491,14 +592,35 @@ def arrays(self, recurse=False): recurse=recurse) def _array_iter(self, keys_only, method, recurse): - for key in sorted(listdir(self._store, self._path)): - path = self._key_prefix + key - if contains_array(self._store, path): - yield key if keys_only else (key, self[key]) - elif recurse and contains_group(self._store, path): - group = self[key] - for i in getattr(group, method)(recurse=recurse): - yield i + if self._version == 2: + for key in sorted(listdir(self._store, self._path)): + path = self._key_prefix + key + assert not path.startswith("meta") + if contains_array(self._store, path): + _key = key.rstrip("/") + yield _key if keys_only else (_key, self[key]) + elif recurse and contains_group(self._store, path): + group = self[key] + for i in getattr(group, method)(recurse=recurse): + yield i + else: + dir_name = meta_root + self._path + array_sfx = '.array' + self._metadata_key_suffix + for key in sorted(listdir(self._store, dir_name)): + if key.endswith(array_sfx): + key = key[:-len(array_sfx)] + path = self._key_prefix + key + assert not path.startswith("meta") + if key.endswith('.group' + self._metadata_key_suffix): + # skip group metadata keys + continue + if contains_array(self._store, path): + _key = key.rstrip("/") + yield _key if keys_only else (_key, self[key]) + elif recurse and contains_group(self._store, path): + group = self[key] + for i in getattr(group, method)(recurse=recurse): + yield i def visitvalues(self, func): """Run ``func`` on each object. 
@@ -707,7 +829,7 @@ def _create_group_nosync(self, name, overwrite=False): return Group(self._store, path=path, read_only=self._read_only, chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer) + synchronizer=self._synchronizer, zarr_version=self._version) def create_groups(self, *names, **kwargs): """Convenience method to create multiple groups in a single call.""" @@ -751,7 +873,7 @@ def _require_group_nosync(self, name, overwrite=False): return Group(self._store, path=path, read_only=self._read_only, chunk_store=self._chunk_store, cache_attrs=self.attrs.cache, - synchronizer=self._synchronizer) + synchronizer=self._synchronizer, zarr_version=self._version) def require_groups(self, *names): """Convenience method to require multiple groups in a single call.""" @@ -1039,9 +1161,10 @@ def move(self, source, dest): # Check that source exists. if not (contains_array(self._store, source) or - contains_group(self._store, source)): + contains_group(self._store, source, explicit_only=False)): raise ValueError('The source, "%s", does not exist.' % source) - if contains_array(self._store, dest) or contains_group(self._store, dest): + if (contains_array(self._store, dest) or + contains_group(self._store, dest, explicit_only=False)): raise ValueError('The dest, "%s", already exists.' % dest) # Ensure groups needed for `dest` exist. @@ -1051,15 +1174,19 @@ def move(self, source, dest): self._write_op(self._move_nosync, source, dest) -def _normalize_store_arg(store, *, storage_options=None, mode="r"): +def _normalize_store_arg(store, *, storage_options=None, mode="r", + zarr_version=None): + if zarr_version is None: + zarr_version = getattr(store, '_store_version', DEFAULT_ZARR_VERSION) if store is None: - return MemoryStore() + return MemoryStore() if zarr_version == 2 else MemoryStoreV3() return normalize_store_arg(store, - storage_options=storage_options, mode=mode) + storage_options=storage_options, mode=mode, + zarr_version=zarr_version) def group(store=None, overwrite=False, chunk_store=None, - cache_attrs=True, synchronizer=None, path=None): + cache_attrs=True, synchronizer=None, path=None, *, zarr_version=None): """Create a group. Parameters @@ -1104,20 +1231,29 @@ def group(store=None, overwrite=False, chunk_store=None, """ # handle polymorphic store arg - store = _normalize_store_arg(store) + store = _normalize_store_arg(store, zarr_version=zarr_version) + if zarr_version is None: + zarr_version = getattr(store, '_store_version', DEFAULT_ZARR_VERSION) + if zarr_version == 3 and path is None: + raise ValueError(f"path must be provided for a v{zarr_version} group") path = normalize_storage_path(path) - # require group - if overwrite or not contains_group(store): + if zarr_version == 2: + requires_init = overwrite or not contains_group(store) + elif zarr_version == 3: + requires_init = overwrite or not contains_group(store, path) + + if requires_init: init_group(store, overwrite=overwrite, chunk_store=chunk_store, path=path) return Group(store, read_only=False, chunk_store=chunk_store, - cache_attrs=cache_attrs, synchronizer=synchronizer, path=path) + cache_attrs=cache_attrs, synchronizer=synchronizer, path=path, + zarr_version=zarr_version) def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=None, - chunk_store=None, storage_options=None): + chunk_store=None, storage_options=None, *, zarr_version=None): """Open a group using file-mode-like semantics. 
Parameters @@ -1165,12 +1301,23 @@ def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=N # handle polymorphic store arg store = _normalize_store_arg( - store, storage_options=storage_options, mode=mode - ) + store, storage_options=storage_options, mode=mode, + zarr_version=zarr_version) + if zarr_version is None: + zarr_version = getattr(store, '_store_version', DEFAULT_ZARR_VERSION) if chunk_store is not None: chunk_store = _normalize_store_arg(chunk_store, storage_options=storage_options, mode=mode) + if not getattr(chunk_store, '_store_version', DEFAULT_ZARR_VERSION) == zarr_version: + raise ValueError( + "zarr_version of store and chunk_store must match" + ) + + store_version = getattr(store, '_store_version', 2) + if store_version == 3 and path is None: + raise ValueError("path must be supplied to initialize a zarr v3 group") + path = normalize_storage_path(path) # ensure store is initialized @@ -1202,4 +1349,5 @@ def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=N read_only = mode == 'r' return Group(store, read_only=read_only, cache_attrs=cache_attrs, - synchronizer=synchronizer, path=path, chunk_store=chunk_store) + synchronizer=synchronizer, path=path, chunk_store=chunk_store, + zarr_version=zarr_version) diff --git a/zarr/meta.py b/zarr/meta.py index c292b09a14..bb4bae4199 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -1,14 +1,87 @@ import base64 +import itertools from collections.abc import Mapping +import numcodecs import numpy as np +from numcodecs.abc import Codec from zarr.errors import MetadataError from zarr.util import json_dumps, json_loads -from typing import cast, Union, Any, List, Mapping as MappingType +from typing import cast, Union, Any, List, Mapping as MappingType, Optional ZARR_FORMAT = 2 +ZARR_FORMAT_v3 = 3 + +# FLOAT_FILLS = {"NaN": np.nan, "Infinity": np.PINF, "-Infinity": np.NINF} + +_default_entry_point_metadata_v3 = { + "zarr_format": "https://purl.org/zarr/spec/protocol/core/3.0", + "metadata_encoding": "https://purl.org/zarr/spec/protocol/core/3.0", + "metadata_key_suffix": ".json", + "extensions": [], +} + +_v3_core_types = set( + "".join(d) for d in itertools.product("<>", ("u", "i", "f"), ("2", "4", "8")) +) +_v3_core_types = {"bool", "i1", "u1"} | _v3_core_types + +# The set of complex types allowed ({"c8", ">c16"}) +_v3_complex_types = set( + f"{end}c{_bytes}" for end, _bytes in itertools.product("<>", ("8", "16")) +) + +# All dtype.str values corresponding to datetime64 and timedelta64 +# see: https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units +_date_units = ["Y", "M", "W", "D"] +_time_units = ["h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as"] +_v3_datetime_types = set( + f"{end}{kind}8[{unit}]" + for end, unit, kind in itertools.product("<>", _date_units + _time_units, ('m', 'M')) +) + + +def get_extended_dtype_info(dtype) -> dict: + if dtype.str in _v3_complex_types: + return dict( + extension="https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/extensions/complex-dtypes/v1.0.html", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str == "|O": + return dict( + extension="TODO: object array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("|S"): + return dict( + extension="TODO: bytestring array protocol URL", # noqa + type=dtype.str, + fallback=None, + ) + elif dtype.str.startswith("U"): + return dict( + extension="TODO: unicode array protocol URL", # noqa + type=dtype.str, + fallback=None, + 
) +    elif dtype.str.startswith("|V"): +        return dict( +            extension="TODO: structured array protocol URL",  # noqa +            type=dtype.descr, +            fallback=None, +        ) +    elif dtype.str in _v3_datetime_types: +        return dict( +            extension="https://zarr-specs.readthedocs.io/en/core-protocol-v3.0-dev/protocol/extensions/datetime-dtypes/v1.0.html",  # noqa +            type=dtype.str, +            fallback=None, +        ) +    else: +        raise ValueError(f"Unsupported dtype: {dtype}")   class Metadata2: @@ -46,12 +119,13 @@ def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, A dtype = cls.decode_dtype(meta["dtype"]) if dtype.hasobject: import numcodecs -            object_codec = numcodecs.get_codec(meta['filters'][0]) + +            object_codec = numcodecs.get_codec(meta["filters"][0]) else: object_codec = None dimension_separator = meta.get("dimension_separator", None) -        fill_value = cls.decode_fill_value(meta['fill_value'], dtype, object_codec) +        fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) meta = dict( zarr_format=meta["zarr_format"], shape=tuple(meta["shape"]), @@ -63,7 +137,7 @@ def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: filters=meta["filters"], ) if dimension_separator: -            meta['dimension_separator'] = dimension_separator + meta["dimension_separator"] = dimension_separator except Exception as e: raise MetadataError("error decoding metadata") from e else: @@ -79,7 +153,8 @@ dimension_separator = meta.get("dimension_separator") if dtype.hasobject: import numcodecs -            object_codec = numcodecs.get_codec(meta['filters'][0]) + +            object_codec = numcodecs.get_codec(meta["filters"][0]) else: object_codec = None @@ -93,9 +168,6 @@ order=meta["order"], filters=meta["filters"], ) -        if dimension_separator: -            meta['dimension_separator'] = dimension_separator - -        if dimension_separator: + if dimension_separator: meta["dimension_separator"] = dimension_separator @@ -141,13 +213,15 @@ def encode_group_metadata(cls, meta=None) -> bytes: return json_dumps(meta) @classmethod -    def decode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: +    def decode_fill_value( +        cls, v: Any, dtype: np.dtype, object_codec: Any = None +    ) -> Any: # early out if v is None: return v -        if dtype.kind == 'V' and dtype.hasobject: +        if dtype.kind == "V" and dtype.hasobject: if object_codec is None: -                raise ValueError('missing object_codec for object array') +                raise ValueError("missing object_codec for object array") v = base64.standard_b64decode(v) v = object_codec.decode(v) v = np.array(v, dtype=dtype)[()] @@ -189,15 +263,17 @@ def decode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return np.array(v, dtype=dtype)[()] @classmethod -    def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> Any: +    def encode_fill_value( +        cls, v: Any, dtype: np.dtype, object_codec: Any = None +    ) -> Any: # early out if v is None: return v -        if dtype.kind == 'V' and dtype.hasobject: +        if dtype.kind == "V" and dtype.hasobject: if object_codec is None: -                raise ValueError('missing object_codec for object array') +                raise ValueError("missing object_codec for object array") v = object_codec.encode(v) -            v = str(base64.standard_b64encode(v), 'ascii') +            v = str(base64.standard_b64encode(v), "ascii") return v if dtype.kind == "f": if np.isnan(v): @@ -214,8 +290,10 @@ def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return bool(v)
elif dtype.kind in "c": c = cast(np.complex128, np.dtype(complex).type()) - v = (cls.encode_fill_value(v.real, c.real.dtype, object_codec), - cls.encode_fill_value(v.imag, c.imag.dtype, object_codec)) + v = ( + cls.encode_fill_value(v.real, c.real.dtype, object_codec), + cls.encode_fill_value(v.imag, c.imag.dtype, object_codec), + ) return v elif dtype.kind in "SV": v = str(base64.standard_b64encode(v), "ascii") @@ -228,7 +306,235 @@ def encode_fill_value(cls, v: Any, dtype: np.dtype, object_codec: Any = None) -> return v -# expose class methods for backwards compatibility +class Metadata3(Metadata2): + ZARR_FORMAT = ZARR_FORMAT_v3 + + @classmethod + def decode_dtype(cls, d, validate=True): + if isinstance(d, dict): + # extract the type from the extension info + try: + d = d['type'] + except KeyError: + raise KeyError( + "Extended dtype info must provide a key named 'type'." + ) + d = cls._decode_dtype_descr(d) + dtype = np.dtype(d) + if validate: + if dtype.str in (_v3_core_types | {"|b1", "|u1", "|i1"}): + # it is a core dtype of the v3 spec + pass + else: + # will raise if this is not a recognized extended dtype + get_extended_dtype_info(dtype) + return dtype + + @classmethod + def encode_dtype(cls, d): + s = d.str + if s == "|b1": + return "bool" + elif s == "|u1": + return "u1" + elif s == "|i1": + return "i1" + elif s in _v3_core_types: + return Metadata2.encode_dtype(d) + else: + # Check if this dtype corresponds to a supported extension to + # the v3 protocol. + return get_extended_dtype_info(np.dtype(d)) + + @classmethod + def decode_group_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + # 1 / 0 + # # check metadata format version + # zarr_format = meta.get("zarr_format", None) + # if zarr_format != cls.ZARR_FORMAT: + # raise MetadataError("unsupported zarr format: %s" % zarr_format) + + assert "attributes" in meta + # meta = dict(attributes=meta['attributes']) + return meta + + # return json.loads(s) + + @classmethod + def encode_group_metadata(cls, meta=None) -> bytes: + # The ZARR_FORMAT should not be in the group metadata, but in the + # entry point metadata instead + # meta = dict(zarr_format=cls.ZARR_FORMAT) + if meta is None: + meta = {"attributes": {}} + meta = dict(attributes=meta.get("attributes", {})) + return json_dumps(meta) + + @classmethod + def encode_hierarchy_metadata(cls, meta=None) -> bytes: + if meta is None: + meta = _default_entry_point_metadata_v3 + elif set(meta.keys()) != { + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", + }: + raise ValueError(f"Unexpected keys in metadata. meta={meta}") + return json_dumps(meta) + + @classmethod + def decode_hierarchy_metadata( + cls, s: Union[MappingType, str] + ) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + # check metadata format + # zarr_format = meta.get("zarr_format", None) + # if zarr_format != "https://purl.org/zarr/spec/protocol/core/3.0": + # raise MetadataError("unsupported zarr format: %s" % zarr_format) + if set(meta.keys()) != { + "zarr_format", + "metadata_encoding", + "metadata_key_suffix", + "extensions", + }: + raise ValueError(f"Unexpected keys in metdata. 
meta={meta}") + return meta + + @classmethod + def _encode_codec_metadata(cls, codec: Codec) -> Optional[Mapping]: + if codec is None: + return None + + # only support gzip for now + config = codec.get_config() + del config["id"] + uri = 'https://purl.org/zarr/spec/codec/' + if isinstance(codec, numcodecs.GZip): + uri = uri + "gzip/1.0" + elif isinstance(codec, numcodecs.Zlib): + uri = uri + "zlib/1.0" + elif isinstance(codec, numcodecs.Blosc): + uri = uri + "blosc/1.0" + elif isinstance(codec, numcodecs.BZ2): + uri = uri + "bz2/1.0" + elif isinstance(codec, numcodecs.LZ4): + uri = uri + "lz4/1.0" + elif isinstance(codec, numcodecs.LZMA): + uri = uri + "lzma/1.0" + meta = { + "codec": uri, + "configuration": config, + } + return meta + + @classmethod + def _decode_codec_metadata(cls, meta: Optional[Mapping]) -> Optional[Codec]: + if meta is None: + return None + + uri = 'https://purl.org/zarr/spec/codec/' + conf = meta['configuration'] + if meta['codec'].startswith(uri + 'gzip/'): + codec = numcodecs.GZip(level=conf['level']) + elif meta['codec'].startswith(uri + 'zlib/'): + codec = numcodecs.Zlib(level=conf['level']) + elif meta['codec'].startswith(uri + 'blosc/'): + codec = numcodecs.Blosc(clevel=conf['clevel'], + shuffle=conf['shuffle'], + blocksize=conf['blocksize'], + cname=conf['cname']) + elif meta['codec'].startswith(uri + 'bz2/'): + codec = numcodecs.BZ2(level=conf['level']) + elif meta['codec'].startswith(uri + 'lz4/'): + codec = numcodecs.LZ4(acceleration=conf['acceleration']) + elif meta['codec'].startswith(uri + 'lzma/'): + codec = numcodecs.LZMA(format=conf['format'], + check=conf['check'], + preset=conf['preset'], + filters=conf['filters']) + else: + raise NotImplementedError + + return codec + + @classmethod + def decode_array_metadata(cls, s: Union[MappingType, str]) -> MappingType[str, Any]: + meta = cls.parse_metadata(s) + + # extract array metadata fields + try: + dtype = cls.decode_dtype(meta["data_type"]) + if dtype.hasobject: + import numcodecs + + object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) + else: + object_codec = None + fill_value = cls.decode_fill_value(meta["fill_value"], dtype, object_codec) + # TODO: remove dimension_separator? 
+ + compressor = cls._decode_codec_metadata(meta.get("compressor", None)) + extensions = meta.get("extensions", []) + meta = dict( + shape=tuple(meta["shape"]), + chunk_grid=dict( + type=meta["chunk_grid"]["type"], + chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), + separator=meta["chunk_grid"]["separator"], + ), + data_type=dtype, + fill_value=fill_value, + chunk_memory_layout=meta["chunk_memory_layout"], + attributes=meta["attributes"], + extensions=extensions, + ) + # compressor field should be absent when there is no compression + if compressor: + meta['compressor'] = compressor + + except Exception as e: + raise MetadataError("error decoding metadata: %s" % e) + else: + return meta + + @classmethod + def encode_array_metadata(cls, meta: MappingType[str, Any]) -> bytes: + dtype = meta["data_type"] + sdshape = () + if dtype.subdtype is not None: + dtype, sdshape = dtype.subdtype + dimension_separator = meta.get("dimension_separator") + if dtype.hasobject: + import numcodecs + + object_codec = numcodecs.get_codec(meta["attributes"]["filters"][0]) + else: + object_codec = None + + compressor = cls._encode_codec_metadata(meta.get("compressor", None)) + extensions = meta.get("extensions", []) + meta = dict( + shape=meta["shape"] + sdshape, + chunk_grid=dict( + type=meta["chunk_grid"]["type"], + chunk_shape=tuple(meta["chunk_grid"]["chunk_shape"]), + separator=meta["chunk_grid"]["separator"], + ), + data_type=cls.encode_dtype(dtype), + fill_value=encode_fill_value(meta["fill_value"], dtype, object_codec), + chunk_memory_layout=meta["chunk_memory_layout"], + attributes=meta.get("attributes", {}), + extensions=extensions, + ) + if compressor: + meta["compressor"] = compressor + if dimension_separator: + meta["dimension_separator"] = dimension_separator + return json_dumps(meta) + + parse_metadata = Metadata2.parse_metadata decode_array_metadata = Metadata2.decode_array_metadata encode_array_metadata = Metadata2.encode_array_metadata diff --git a/zarr/storage.py b/zarr/storage.py index 35e1fdb0a2..709bbba7ee 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -35,6 +35,7 @@ import uuid import time +from numcodecs.abc import Codec from numcodecs.compat import ( ensure_bytes, ensure_text, @@ -42,6 +43,7 @@ ) from numcodecs.registry import codec_registry +from zarr._storage.store import DEFAULT_ZARR_VERSION from zarr.errors import ( MetadataError, BadCompressorError, @@ -56,16 +58,25 @@ normalize_dtype, normalize_fill_value, normalize_order, normalize_shape, normalize_storage_path, retry_call) -from zarr._storage.absstore import ABSStore # noqa: F401 -from zarr._storage.store import (_listdir_from_keys, - _path_to_prefix, +from zarr._storage.absstore import ABSStore, ABSStoreV3 # noqa: F401 +from zarr._storage.store import (_get_hierarchy_metadata, # noqa: F401 + _get_metadata_suffix, + _listdir_from_keys, _rename_from_keys, + _rename_metadata_v3, _rmdir_from_keys, + _rmdir_from_keys_v3, + _path_to_prefix, + _prefix_to_array_key, + _prefix_to_group_key, array_meta_key, - group_meta_key, attrs_key, + data_root, + group_meta_key, + meta_root, BaseStore, - Store) + Store, + StoreV3) __doctest_requires__ = { ('RedisStore', 'RedisStore.*'): ['redis'], @@ -92,39 +103,91 @@ def contains_array(store: StoreLike, path: Path = None) -> bool: """Return True if the store contains an array at the given logical path.""" path = normalize_storage_path(path) prefix = _path_to_prefix(path) - key = prefix + array_meta_key + key = _prefix_to_array_key(store, prefix) return key in store -def 
contains_group(store: StoreLike, path: Path = None) -> bool: +def contains_group(store: StoreLike, path: Path = None, explicit_only=True) -> bool: """Return True if the store contains a group at the given logical path.""" path = normalize_storage_path(path) prefix = _path_to_prefix(path) - key = prefix + group_meta_key - return key in store + key = _prefix_to_group_key(store, prefix) + store_version = getattr(store, '_store_version', 2) + if store_version == 2 or explicit_only: + return key in store + else: + if key in store: + return True + # for v3, need to also handle implicit groups + sfx = _get_metadata_suffix(store) # type: ignore + implicit_prefix = key.replace('.group' + sfx, '') + if not implicit_prefix.endswith('/'): + implicit_prefix += '/' + if store.list_prefix(implicit_prefix): # type: ignore + return True + return False -def normalize_store_arg(store: Any, storage_options=None, mode="r") -> BaseStore: + +def normalize_store_arg(store: Any, storage_options=None, mode="r", *, + zarr_version=None) -> BaseStore: + if zarr_version is None: + # default to v2 store for backward compatibility + zarr_version = getattr(store, '_store_version', DEFAULT_ZARR_VERSION) + if zarr_version not in [2, 3]: + raise ValueError("zarr_version must be 2 or 3") if store is None: - return BaseStore._ensure_store(dict()) - elif isinstance(store, os.PathLike): - store = os.fspath(store) - if isinstance(store, str): - if "://" in store or "::" in store: - return FSStore(store, mode=mode, **(storage_options or {})) - elif storage_options: - raise ValueError("storage_options passed with non-fsspec path") - if store.endswith('.zip'): - return ZipStore(store, mode=mode) - elif store.endswith('.n5'): - from zarr.n5 import N5Store - return N5Store(store) + if zarr_version == 2: + store = KVStore(dict()) else: - return DirectoryStore(store) - else: - if not isinstance(store, BaseStore) and isinstance(store, MutableMapping): - store = BaseStore._ensure_store(store) + store = KVStoreV3(dict()) + # add default zarr.json metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) return store + elif hasattr(store, '_store_version') and store._store_version != zarr_version: + raise ValueError( + f"store is a zarr v{store._store_version} store which conflicts " + f"with the specified zarr_version ({zarr_version})." 
+ ) + + if isinstance(store, os.PathLike): + store = os.fspath(store) + if isinstance(store, str): + if zarr_version == 2: + if "://" in store or "::" in store: + return FSStore(store, mode=mode, **(storage_options or {})) + elif storage_options: + raise ValueError("storage_options passed with non-fsspec path") + if store.endswith('.zip'): + return ZipStore(store, mode=mode) + elif store.endswith('.n5'): + from zarr.n5 import N5Store + return N5Store(store) + else: + return DirectoryStore(store) + elif zarr_version == 3: + if "://" in store or "::" in store: + store = FSStoreV3(store, mode=mode, **(storage_options or {})) + elif storage_options: + raise ValueError("storage_options passed with non-fsspec path") + elif store.endswith('.zip'): + store = ZipStoreV3(store, mode=mode) + elif store.endswith('.n5'): + raise NotImplementedError("N5Store not yet implemented for V3") + # return N5StoreV3(store) + else: + store = DirectoryStoreV3(store) + # add default zarr.json metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) + return store + elif zarr_version == 2: + store = Store._ensure_store(store) + elif zarr_version == 3: + store = StoreV3._ensure_store(store) + if 'zarr.json' not in store: + # add default zarr.json metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) + return store def rmdir(store: StoreLike, path: Path = None): @@ -132,15 +195,19 @@ def rmdir(store: StoreLike, path: Path = None): this will be called, otherwise will fall back to implementation via the `Store` interface.""" path = normalize_storage_path(path) + store_version = getattr(store, '_store_version', 2) if hasattr(store, "rmdir") and store.is_erasable(): # type: ignore # pass through store.rmdir(path) # type: ignore else: # slow version, delete one key at a time - _rmdir_from_keys(store, path) + if store_version == 2: + _rmdir_from_keys(store, path) + else: + _rmdir_from_keys_v3(store, path) # type: ignore -def rename(store: BaseStore, src_path: Path, dst_path: Path): +def rename(store: Store, src_path: Path, dst_path: Path): """Rename all items under the given path. If `store` provides a `rename` method, this will be called, otherwise will fall back to implementation via the `Store` interface.""" @@ -172,33 +239,45 @@ def listdir(store: BaseStore, path: Path = None): return _listdir_from_keys(store, path) +def _getsize(store: BaseStore, path: Path = None) -> int: + # compute from size of values + if path and path in store: + v = store[path] + size = buffer_size(v) + else: + path = '' if path is None else normalize_storage_path(path) + size = 0 + store_version = getattr(store, '_store_version', 2) + if store_version == 3: + members = store.list_prefix(data_root + path) # type: ignore + members += store.list_prefix(meta_root + path) # type: ignore + # members += ['zarr.json'] + else: + members = listdir(store, path) + prefix = _path_to_prefix(path) + members = [prefix + k for k in members] + for k in members: + try: + v = store[k] + except KeyError: + pass + else: + try: + size += buffer_size(v) + except TypeError: + return -1 + return size + + def getsize(store: BaseStore, path: Path = None) -> int: """Compute size of stored items for a given path. 
If `store` provides a `getsize` method, this will be called, otherwise will return -1.""" - path = normalize_storage_path(path) if hasattr(store, 'getsize'): # pass through + path = normalize_storage_path(path) return store.getsize(path) # type: ignore elif isinstance(store, MutableMapping): - # compute from size of values - if path in store: - v = store[path] - size = buffer_size(v) - else: - members = listdir(store, path) - prefix = _path_to_prefix(path) - size = 0 - for k in members: - try: - v = store[prefix + k] - except KeyError: - pass - else: - try: - size += buffer_size(v) - except TypeError: - return -1 - return size + return _getsize(store, path) else: return -1 @@ -345,8 +424,18 @@ def init_array( path = normalize_storage_path(path) # ensure parent group initialized - _require_parent_group(path, store=store, chunk_store=chunk_store, overwrite=overwrite) + store_version = getattr(store, "_store_version", 2) + if store_version < 3: + _require_parent_group(path, store=store, chunk_store=chunk_store, + overwrite=overwrite) + + if store_version == 3 and 'zarr.json' not in store: + # initialize with default zarr.json entry level metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore + if not compressor: + # compatibility with legacy tests using compressor=[] + compressor = None _init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, overwrite=overwrite, path=path, @@ -371,16 +460,50 @@ def _init_array_metadata( dimension_separator=None, ): + store_version = getattr(store, '_store_version', 2) + + path = normalize_storage_path(path) + # guard conditions if overwrite: - # attempt to delete any pre-existing items in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - elif contains_array(store, path): - raise ContainsArrayError(path) - elif contains_group(store, path): - raise ContainsGroupError(path) + if store_version == 2: + # attempt to delete any pre-existing array in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) + else: + group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) + array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) + data_prefix = data_root + _path_to_prefix(path) + + # attempt to delete any pre-existing array in store + if array_meta_key in store: + store.erase(array_meta_key) # type: ignore + if group_meta_key in store: + store.erase(group_meta_key) # type: ignore + store.erase_prefix(data_prefix) # type: ignore + if chunk_store is not None: + chunk_store.erase_prefix(data_prefix) # type: ignore + + if '/' in path: + # path is a subfolder of an existing array, remove that array + parent_path = '/'.join(path.split('/')[:-1]) + sfx = _get_metadata_suffix(store) # type: ignore + array_key = meta_root + parent_path + '.array' + sfx + if array_key in store: + store.erase(array_key) # type: ignore + + if not overwrite: + if contains_array(store, path): + raise ContainsArrayError(path) + elif contains_group(store, path, explicit_only=False): + raise ContainsGroupError(path) + elif store_version == 3: + if '/' in path: + # cannot create an array within an existing array path + parent_path = '/'.join(path.split('/')[:-1]) + if contains_array(store, parent_path): + raise ContainsArrayError(path) # normalize metadata dtype, object_codec = normalize_dtype(dtype, object_codec) @@ -391,7 +514,7 @@ def _init_array_metadata( fill_value = 
normalize_fill_value(fill_value, dtype) # optional array metadata - if dimension_separator is None: + if dimension_separator is None and store_version == 2: dimension_separator = getattr(store, "_dimension_separator", None) dimension_separator = normalize_dimension_separator(dimension_separator) @@ -408,13 +531,21 @@ def _init_array_metadata( # obtain compressor config compressor_config = None if compressor: - try: - compressor_config = compressor.get_config() - except AttributeError as e: - raise BadCompressorError(compressor) from e + if store_version == 2: + try: + compressor_config = compressor.get_config() + except AttributeError as e: + raise BadCompressorError(compressor) from e + elif not isinstance(compressor, Codec): + raise ValueError("expected a numcodecs Codec for compressor") + # TODO: alternatively, could autoconvert str to a Codec + # e.g. 'zlib' -> numcodec.Zlib object + # compressor = numcodecs.get_codec({'id': compressor}) # obtain filters config if filters: + # TODO: filters was removed from the metadata in v3 + # raise error here if store_version > 2? filters_config = [f.get_config() for f in filters] else: filters_config = [] @@ -440,11 +571,31 @@ def _init_array_metadata( filters_config = None # type: ignore # initialize metadata - meta = dict(shape=shape, chunks=chunks, dtype=dtype, - compressor=compressor_config, fill_value=fill_value, - order=order, filters=filters_config, + # TODO: don't store redundant dimension_separator for v3? + _compressor = compressor_config if store_version == 2 else compressor + meta = dict(shape=shape, compressor=_compressor, + fill_value=fill_value, dimension_separator=dimension_separator) - key = _path_to_prefix(path) + array_meta_key + if store_version < 3: + meta.update(dict(chunks=chunks, dtype=dtype, order=order, + filters=filters_config)) + else: + if dimension_separator is None: + dimension_separator = "/" + if filters_config: + attributes = {'filters': filters_config} + else: + attributes = {} + meta.update( + dict(chunk_grid=dict(type="regular", + chunk_shape=chunks, + separator=dimension_separator), + chunk_memory_layout=order, + data_type=dtype, + attributes=attributes) + ) + + key = _prefix_to_array_key(store, _path_to_prefix(path)) if hasattr(store, '_metadata_class'): store[key] = store._metadata_class.encode_array_metadata(meta) # type: ignore else: @@ -481,14 +632,26 @@ def init_group( # normalize path path = normalize_storage_path(path) - # ensure parent group initialized - _require_parent_group(path, store=store, chunk_store=chunk_store, - overwrite=overwrite) + store_version = getattr(store, '_store_version', 2) + if store_version < 3: + # ensure parent group initialized + _require_parent_group(path, store=store, chunk_store=chunk_store, + overwrite=overwrite) + + if store_version == 3 and 'zarr.json' not in store: + # initialize with default zarr.json entry level metadata + store['zarr.json'] = store._metadata_class.encode_hierarchy_metadata(None) # type: ignore # initialise metadata _init_group_metadata(store=store, overwrite=overwrite, path=path, chunk_store=chunk_store) + if store_version == 3: + # TODO: Should initializing a v3 group also create a corresponding + # empty folder under data/root/? I think probably not until there + # is actual data written there. 
+ pass + def _init_group_metadata( store: StoreLike, @@ -497,22 +660,51 @@ def _init_group_metadata( chunk_store: StoreLike = None, ): + store_version = getattr(store, '_store_version', 2) + path = normalize_storage_path(path) + # guard conditions if overwrite: - # attempt to delete any pre-existing items in store - rmdir(store, path) - if chunk_store is not None: - rmdir(chunk_store, path) - elif contains_array(store, path): - raise ContainsArrayError(path) - elif contains_group(store, path): - raise ContainsGroupError(path) + if store_version == 2: + # attempt to delete any pre-existing items in store + rmdir(store, path) + if chunk_store is not None: + rmdir(chunk_store, path) + else: + group_meta_key = _prefix_to_group_key(store, _path_to_prefix(path)) + array_meta_key = _prefix_to_array_key(store, _path_to_prefix(path)) + data_prefix = data_root + _path_to_prefix(path) + meta_prefix = meta_root + _path_to_prefix(path) + + # attempt to delete any pre-existing array in store + if array_meta_key in store: + store.erase(array_meta_key) # type: ignore + if group_meta_key in store: + store.erase(group_meta_key) # type: ignore + store.erase_prefix(data_prefix) # type: ignore + store.erase_prefix(meta_prefix) # type: ignore + if chunk_store is not None: + chunk_store.erase_prefix(data_prefix) # type: ignore + + if not overwrite: + if contains_array(store, path): + raise ContainsArrayError(path) + elif contains_group(store, path): + raise ContainsGroupError(path) + elif store_version == 3 and '/' in path: + # cannot create a group overlapping with an existing array name + parent_path = '/'.join(path.split('/')[:-1]) + if contains_array(store, parent_path): + raise ContainsArrayError(path) # initialize metadata # N.B., currently no metadata properties are needed, however there may # be in future - meta = dict() # type: ignore - key = _path_to_prefix(path) + group_meta_key + if store_version == 3: + meta = {'attributes': {}} # type: ignore + else: + meta = {} # type: ignore + key = _prefix_to_group_key(store, _path_to_prefix(path)) if hasattr(store, '_metadata_class'): store[key] = store._metadata_class.encode_group_metadata(meta) # type: ignore else: @@ -1132,14 +1324,17 @@ def __init__(self, url, normalize_keys=False, key_separator=None, dimension_separator = key_separator self.key_separator = dimension_separator - if self.key_separator is None: - self.key_separator = "." + self._default_key_separator() # Pass attributes to array creation self._dimension_separator = dimension_separator if self.fs.exists(self.path) and not self.fs.isdir(self.path): raise FSPathExistNotDir(url) + def _default_key_separator(self): + if self.key_separator is None: + self.key_separator = "." + def _normalize_key(self, key): key = normalize_storage_path(key).lstrip('/') if key: @@ -1886,6 +2081,10 @@ def __contains__(self, key): key = key.encode("ascii") return key in self.db + def rmdir(self, path: str = "") -> None: + path = normalize_storage_path(path) + _rmdir_from_keys(self, path) + class LMDBStore(Store): """Storage class using LMDB. 
Requires the `lmdb `_ @@ -2641,7 +2840,7 @@ def __init__(self, store: StoreLike, metadata_key=".zmetadata"): self.store = Store._ensure_store(store) # retrieve consolidated metadata - meta = json_loads(store[metadata_key]) + meta = json_loads(self.store[metadata_key]) # check format of consolidated metadata consolidated_format = meta.get('zarr_consolidated_format', None) @@ -2675,3 +2874,496 @@ def getsize(self, path): def listdir(self, path): return listdir(self.meta_store, path) + + +""" versions of stores following the v3 protocol """ + + +def _get_files_and_dirs_from_path(store, path): + path = normalize_storage_path(path) + + files = [] + # add array metadata file if present + array_key = _prefix_to_array_key(store, path) + if array_key in store: + files.append(os.path.join(store.path, array_key)) + + # add group metadata file if present + group_key = _prefix_to_group_key(store, path) + if group_key in store: + files.append(os.path.join(store.path, group_key)) + + dirs = [] + # add array and group folders if present + for d in [data_root + path, meta_root + path]: + dir_path = os.path.join(store.path, d) + if os.path.exists(dir_path): + dirs.append(dir_path) + return files, dirs + + +class RmdirV3(): + """Mixin class that can be used to ensure override of any existing v2 rmdir class.""" + + def rmdir(self, path: str = "") -> None: + path = normalize_storage_path(path) + _rmdir_from_keys_v3(self, path) # type: ignore + + +class KVStoreV3(RmdirV3, KVStore, StoreV3): + + def list(self): + return list(self._mutable_mapping.keys()) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def __eq__(self, other): + return ( + isinstance(other, KVStoreV3) and + self._mutable_mapping == other._mutable_mapping + ) + + +KVStoreV3.__doc__ = KVStore.__doc__ + + +class FSStoreV3(FSStore, StoreV3): + + # FSStoreV3 doesn't use this (FSStore uses it within _normalize_key) + _META_KEYS = () + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def _default_key_separator(self): + if self.key_separator is None: + self.key_separator = "/" + + def list(self): + return list(self.keys()) + + def _normalize_key(self, key): + key = normalize_storage_path(key).lstrip('/') + return key.lower() if self.normalize_keys else key + + def getsize(self, path=None): + size = 0 + if path is None or path == '': + # size of both the data and meta subdirs + dirs = [] + for d in ['data/root', 'meta/root']: + dir_path = os.path.join(self.path, d) + if os.path.exists(dir_path): + dirs.append(dir_path) + elif path in self: + # access individual element by full path + return buffer_size(self[path]) + else: + files, dirs = _get_files_and_dirs_from_path(self, path) + for file in files: + size += os.path.getsize(file) + for d in dirs: + size += self.fs.du(d, total=True, maxdepth=None) + return size + + def setitems(self, values): + if self.mode == 'r': + raise ReadOnlyError() + values = {self._normalize_key(key): val for key, val in values.items()} + + # initialize the /data/root/... folder corresponding to the array! 
+ # Note: zarr.tests.test_core_v3.TestArrayWithFSStoreV3PartialRead fails + # without this explicit creation of directories + subdirectories = set([os.path.dirname(v) for v in values.keys()]) + for subdirectory in subdirectories: + data_dir = os.path.join(self.path, subdirectory) + if not self.fs.exists(data_dir): + self.fs.mkdir(data_dir) + + self.map.setitems(values) + + def rmdir(self, path=None): + if self.mode == 'r': + raise ReadOnlyError() + if path: + for base in [meta_root, data_root]: + store_path = self.dir_path(base + path) + if self.fs.isdir(store_path): + self.fs.rm(store_path, recursive=True) + + # remove any associated metadata files + sfx = _get_metadata_suffix(self) + meta_dir = (meta_root + path).rstrip('/') + array_meta_file = meta_dir + '.array' + sfx + self.pop(array_meta_file, None) + group_meta_file = meta_dir + '.group' + sfx + self.pop(group_meta_file, None) + else: + store_path = self.dir_path(path) + if self.fs.isdir(store_path): + self.fs.rm(store_path, recursive=True) + + +class MemoryStoreV3(MemoryStore, StoreV3): + + def __init__(self, root=None, cls=dict, dimension_separator=None): + if root is None: + self.root = cls() + else: + self.root = root + self.cls = cls + self.write_mutex = Lock() + self._dimension_separator = dimension_separator # TODO: modify for v3? + + def __eq__(self, other): + return ( + isinstance(other, MemoryStoreV3) and + self.root == other.root and + self.cls == other.cls + ) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def list(self): + return list(self.keys()) + + def getsize(self, path: Path = None): + return _getsize(self, path) + + def rename(self, src_path: Path, dst_path: Path): + src_path = normalize_storage_path(src_path) + dst_path = normalize_storage_path(dst_path) + + any_renamed = False + for base in [meta_root, data_root]: + if self.list_prefix(base + src_path): + src_parent, src_key = self._get_parent(base + src_path) + dst_parent, dst_key = self._require_parent(base + dst_path) + + dst_parent[dst_key] = src_parent.pop(src_key) + any_renamed = True + any_renamed = _rename_metadata_v3(self, src_path, dst_path) or any_renamed + if not any_renamed: + raise ValueError(f"no item {src_path} found to rename") + + def rmdir(self, path: Path = None): + path = normalize_storage_path(path) + if path: + for base in [meta_root, data_root]: + try: + parent, key = self._get_parent(base + path) + value = parent[key] + except KeyError: + continue + else: + if isinstance(value, self.cls): + del parent[key] + + # remove any associated metadata files + sfx = _get_metadata_suffix(self) + meta_dir = (meta_root + path).rstrip('/') + array_meta_file = meta_dir + '.array' + sfx + self.pop(array_meta_file, None) + group_meta_file = meta_dir + '.group' + sfx + self.pop(group_meta_file, None) + else: + # clear out root + self.root = self.cls() + + +MemoryStoreV3.__doc__ = MemoryStore.__doc__ + + +class DirectoryStoreV3(DirectoryStore, StoreV3): + + def list(self): + return list(self.keys()) + + def __eq__(self, other): + return ( + isinstance(other, DirectoryStoreV3) and + self.path == other.path + ) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def getsize(self, path: Path = None): + return _getsize(self, path) + + def rename(self, src_path, dst_path, metadata_key_suffix='.json'): + store_src_path = normalize_storage_path(src_path) + store_dst_path = normalize_storage_path(dst_path) + + dir_path = self.path + any_existed = False + for 
root_prefix in ['meta', 'data']: +            src_path = os.path.join(dir_path, root_prefix, 'root', store_src_path) +            if os.path.exists(src_path): +                any_existed = True +                dst_path = os.path.join(dir_path, root_prefix, 'root', store_dst_path) +                os.renames(src_path, dst_path) + +        for suffix in ['.array' + metadata_key_suffix, +                       '.group' + metadata_key_suffix]: +            src_meta = os.path.join(dir_path, 'meta', 'root', store_src_path + suffix) +            if os.path.exists(src_meta): +                any_existed = True +                dst_meta = os.path.join(dir_path, 'meta', 'root', store_dst_path + suffix) +                dst_dir = os.path.dirname(dst_meta) +                if not os.path.exists(dst_dir): +                    os.makedirs(dst_dir) +                os.rename(src_meta, dst_meta) +        if not any_existed: +            raise FileNotFoundError("nothing found at src_path") + +    def rmdir(self, path=None): +        store_path = normalize_storage_path(path) +        dir_path = self.path +        if store_path: +            for base in [meta_root, data_root]: +                # join each base onto the store root, not onto the previous iteration's path +                sub_dir_path = os.path.join(self.path, base + store_path) +                if os.path.isdir(sub_dir_path): +                    shutil.rmtree(sub_dir_path) + +            # remove any associated metadata files +            sfx = _get_metadata_suffix(self) +            meta_dir = (meta_root + path).rstrip('/') +            array_meta_file = meta_dir + '.array' + sfx +            self.pop(array_meta_file, None) +            group_meta_file = meta_dir + '.group' + sfx +            self.pop(group_meta_file, None) + +        elif os.path.isdir(dir_path): +            shutil.rmtree(dir_path) + + +DirectoryStoreV3.__doc__ = DirectoryStore.__doc__ + + +class ZipStoreV3(ZipStore, StoreV3): + +    def list(self): +        return list(self.keys()) + +    def __eq__(self, other): +        return ( +            isinstance(other, ZipStore) and +            self.path == other.path and +            self.compression == other.compression and +            self.allowZip64 == other.allowZip64 +        ) + +    def __setitem__(self, key, value): +        self._validate_key(key) +        super().__setitem__(key, value) + +    def getsize(self, path=None): +        path = normalize_storage_path(path) +        with self.mutex: +            children = self.list_prefix(data_root + path) +            children += self.list_prefix(meta_root + path) +            if children: +                size = 0 +                for name in children: +                    info = self.zf.getinfo(name) +                    size += info.compress_size +                return size +            elif path in self: +                info = self.zf.getinfo(path) +                return info.compress_size +            else: +                return 0 + + +ZipStoreV3.__doc__ = ZipStore.__doc__ + + +class RedisStoreV3(RmdirV3, RedisStore, StoreV3): + +    def list(self): +        return list(self.keys()) + +    def __setitem__(self, key, value): +        self._validate_key(key) +        super().__setitem__(key, value) + + +RedisStoreV3.__doc__ = RedisStore.__doc__ + + +class MongoDBStoreV3(RmdirV3, MongoDBStore, StoreV3): + +    def list(self): +        return list(self.keys()) + +    def __setitem__(self, key, value): +        self._validate_key(key) +        super().__setitem__(key, value) + + +MongoDBStoreV3.__doc__ = MongoDBStore.__doc__ + + +class DBMStoreV3(RmdirV3, DBMStore, StoreV3): + +    def list(self): +        return list(self.keys()) + +    def __setitem__(self, key, value): +        self._validate_key(key) +        super().__setitem__(key, value) + + +DBMStoreV3.__doc__ = DBMStore.__doc__ + + +class LMDBStoreV3(RmdirV3, LMDBStore, StoreV3): + +    def list(self): +        return list(self.keys()) + +    def __setitem__(self, key, value): +        self._validate_key(key) +        super().__setitem__(key, value) + + +LMDBStoreV3.__doc__ = LMDBStore.__doc__ + + +class SQLiteStoreV3(SQLiteStore, StoreV3): + +    def list(self): +        return list(self.keys()) + +    def getsize(self, path=None): +        # TODO: why does the query below not work in this case?
+ # For now fall back to the default _getsize implementation + # size = 0 + # for _path in [data_root + path, meta_root + path]: + # c = self.cursor.execute( + # ''' + # SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr + # WHERE k LIKE (? || "%") AND + # 0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/") + # ''', + # (_path, _path) + # ) + # for item_size, in c: + # size += item_size + # return size + + # fallback to default implementation for now + return _getsize(self, path) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + def rmdir(self, path=None): + path = normalize_storage_path(path) + if path: + for base in [meta_root, data_root]: + with self.lock: + self.cursor.execute( + 'DELETE FROM zarr WHERE k LIKE (? || "/%")', (base + path,) + ) + # remove any associated metadata files + sfx = _get_metadata_suffix(self) + meta_dir = (meta_root + path).rstrip('/') + array_meta_file = meta_dir + '.array' + sfx + self.pop(array_meta_file, None) + group_meta_file = meta_dir + '.group' + sfx + self.pop(group_meta_file, None) + else: + self.clear() + + +SQLiteStoreV3.__doc__ = SQLiteStore.__doc__ + + +class LRUStoreCacheV3(RmdirV3, LRUStoreCache, StoreV3): + + def __init__(self, store, max_size: int): + self._store = StoreV3._ensure_store(store) + self._max_size = max_size + self._current_size = 0 + self._keys_cache = None + self._contains_cache = None + self._listdir_cache: Dict[Path, Any] = dict() + self._values_cache: Dict[Path, Any] = OrderedDict() + self._mutex = Lock() + self.hits = self.misses = 0 + + def list(self): + return list(self.keys()) + + def __setitem__(self, key, value): + self._validate_key(key) + super().__setitem__(key, value) + + +LRUStoreCacheV3.__doc__ = LRUStoreCache.__doc__ + + +class ConsolidatedMetadataStoreV3(ConsolidatedMetadataStore, StoreV3): + """A layer over other storage, where the metadata has been consolidated into + a single key. + + The purpose of this class, is to be able to get all of the metadata for + a given array in a single read operation from the underlying storage. + See :func:`zarr.convenience.consolidate_metadata` for how to create this + single metadata key. + + This class loads from the one key, and stores the data in a dict, so that + accessing the keys no longer requires operations on the backend store. + + This class is read-only, and attempts to change the array metadata will + fail, but changing the data is possible. If the backend storage is changed + directly, then the metadata stored here could become obsolete, and + :func:`zarr.convenience.consolidate_metadata` should be called again and the class + re-invoked. The use case is for write once, read many times. + + .. note:: This is an experimental feature. + + Parameters + ---------- + store: Store + Containing the zarr array. + metadata_key: str + The target in the store where all of the metadata are stored. We + assume JSON encoding. 
+ + See Also + -------- + zarr.convenience.consolidate_metadata, zarr.convenience.open_consolidated + + """ + + def __init__(self, store: StoreLike, metadata_key=meta_root + "consolidated/.zmetadata"): + self.store = StoreV3._ensure_store(store) + + # retrieve consolidated metadata + meta = json_loads(self.store[metadata_key]) + + # check format of consolidated metadata + consolidated_format = meta.get('zarr_consolidated_format', None) + if consolidated_format != 1: + raise MetadataError('unsupported zarr consolidated metadata format: %s' % + consolidated_format) + + # decode metadata + self.meta_store: Store = KVStoreV3(meta["metadata"]) + + def rmdir(self, key): + raise ReadOnlyError() + + # def __setitem__(self, key, value): + # raise ReadOnlyError() diff --git a/zarr/tests/data/store.zip b/zarr/tests/data/store.zip index a36fd675b3..76ba856c62 100644 Binary files a/zarr/tests/data/store.zip and b/zarr/tests/data/store.zip differ diff --git a/zarr/tests/test_attrs.py b/zarr/tests/test_attrs.py index b2de736d4a..dbbc19328a 100644 --- a/zarr/tests/test_attrs.py +++ b/zarr/tests/test_attrs.py @@ -2,24 +2,36 @@ import pytest +from zarr._storage.store import meta_root from zarr.attrs import Attributes -from zarr.tests.util import CountingDict -from zarr.storage import KVStore +from zarr.storage import KVStore, KVStoreV3 +from zarr.tests.util import CountingDict, CountingDictV3 + + +@pytest.fixture(params=[2, 3]) +def zarr_version(request): + return request.param + + +def _init_store(version): + """Use a plain dict() for v2, but KVStoreV3 otherwise.""" + if version == 2: + return dict() + return KVStoreV3(dict()) class TestAttributes(): - def init_attributes(self, store, read_only=False, cache=True): - return Attributes(store, key='attrs', read_only=read_only, cache=cache) + def init_attributes(self, store, read_only=False, cache=True, zarr_version=2): + root = '.z' if zarr_version == 2 else meta_root + return Attributes(store, key=root + 'attrs', read_only=read_only, cache=cache) - @pytest.mark.parametrize('store_from_dict', [False, True]) - def test_storage(self, store_from_dict): + def test_storage(self, zarr_version): - if store_from_dict: - store = dict() - else: - store = KVStore(dict()) - a = Attributes(store=store, key='attrs') + store = _init_store(zarr_version) + root = '.z' if zarr_version == 2 else meta_root + attrs_key = root + 'attrs' + a = Attributes(store=store, key=attrs_key) assert isinstance(a.store, KVStore) assert 'foo' not in a assert 'bar' not in a @@ -27,14 +39,17 @@ def test_storage(self, store_from_dict): a['foo'] = 'bar' a['baz'] = 42 - assert 'attrs' in store - assert isinstance(store['attrs'], bytes) - d = json.loads(str(store['attrs'], 'ascii')) + assert attrs_key in store + assert isinstance(store[attrs_key], bytes) + d = json.loads(str(store[attrs_key], 'ascii')) + if zarr_version == 3: + d = d['attributes'] assert dict(foo='bar', baz=42) == d - def test_get_set_del_contains(self): + def test_get_set_del_contains(self, zarr_version): - a = self.init_attributes(dict()) + store = _init_store(zarr_version) + a = self.init_attributes(store, zarr_version=zarr_version) assert 'foo' not in a a['foo'] = 'bar' a['baz'] = 42 @@ -48,9 +63,10 @@ def test_get_set_del_contains(self): # noinspection PyStatementEffect a['foo'] - def test_update_put(self): + def test_update_put(self, zarr_version): - a = self.init_attributes(dict()) + store = _init_store(zarr_version) + a = self.init_attributes(store, zarr_version=zarr_version) assert 'foo' not in a assert 'bar' not in a assert 
'baz' not in a @@ -65,9 +81,10 @@ def test_update_put(self): assert a['bar'] == 84 assert 'baz' not in a - def test_iterators(self): + def test_iterators(self, zarr_version): - a = self.init_attributes(dict()) + store = _init_store(zarr_version) + a = self.init_attributes(store, zarr_version=zarr_version) assert 0 == len(a) assert set() == set(a) assert set() == set(a.keys()) @@ -83,10 +100,15 @@ def test_iterators(self): assert {'bar', 42} == set(a.values()) assert {('foo', 'bar'), ('baz', 42)} == set(a.items()) - def test_read_only(self): - store = dict() - a = self.init_attributes(store, read_only=True) - store['attrs'] = json.dumps(dict(foo='bar', baz=42)).encode('ascii') + def test_read_only(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store, read_only=True, zarr_version=zarr_version) + if zarr_version == 2: + store['.zattrs'] = json.dumps(dict(foo='bar', baz=42)).encode('ascii') + else: + store['meta/root/attrs'] = json.dumps( + dict(attributes=dict(foo='bar', baz=42)) + ).encode('ascii') assert a['foo'] == 'bar' assert a['baz'] == 42 with pytest.raises(PermissionError): @@ -96,8 +118,9 @@ def test_read_only(self): with pytest.raises(PermissionError): a.update(foo='quux') - def test_key_completions(self): - a = self.init_attributes(dict()) + def test_key_completions(self, zarr_version): + store = _init_store(zarr_version) + a = self.init_attributes(store, zarr_version=zarr_version) d = a._ipython_key_completions_() assert 'foo' not in d assert '123' not in d @@ -112,113 +135,135 @@ def test_key_completions(self): assert 'asdf;' in d assert 'baz' not in d - def test_caching_on(self): + def test_caching_on(self, zarr_version): # caching is turned on by default # setup store - store = CountingDict() - assert 0 == store.counter['__getitem__', 'attrs'] - assert 0 == store.counter['__setitem__', 'attrs'] - store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') - assert 0 == store.counter['__getitem__', 'attrs'] - assert 1 == store.counter['__setitem__', 'attrs'] + store = CountingDict() if zarr_version == 2 else CountingDictV3() + attrs_key = '.zattrs' if zarr_version == 2 else 'meta/root/attrs' + assert 0 == store.counter['__getitem__', attrs_key] + assert 0 == store.counter['__setitem__', attrs_key] + if zarr_version == 2: + store[attrs_key] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + else: + store[attrs_key] = json.dumps(dict(attributes=dict(foo='xxx', bar=42))).encode('ascii') + assert 0 == store.counter['__getitem__', attrs_key] + assert 1 == store.counter['__setitem__', attrs_key] # setup attributes - a = self.init_attributes(store) + a = self.init_attributes(store, zarr_version=zarr_version) # test __getitem__ causes all attributes to be cached assert a['foo'] == 'xxx' - assert 1 == store.counter['__getitem__', 'attrs'] + assert 1 == store.counter['__getitem__', attrs_key] assert a['bar'] == 42 - assert 1 == store.counter['__getitem__', 'attrs'] + assert 1 == store.counter['__getitem__', attrs_key] assert a['foo'] == 'xxx' - assert 1 == store.counter['__getitem__', 'attrs'] + assert 1 == store.counter['__getitem__', attrs_key] # test __setitem__ updates the cache a['foo'] = 'yyy' - assert 2 == store.counter['__getitem__', 'attrs'] - assert 2 == store.counter['__setitem__', 'attrs'] + get_cnt = 2 if zarr_version == 2 else 3 + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 2 == store.counter['__setitem__', attrs_key] assert a['foo'] == 'yyy' - assert 2 == store.counter['__getitem__', 'attrs'] - assert 
2 == store.counter['__setitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 2 == store.counter['__setitem__', attrs_key] # test update() updates the cache a.update(foo='zzz', bar=84) - assert 3 == store.counter['__getitem__', 'attrs'] - assert 3 == store.counter['__setitem__', 'attrs'] + get_cnt = 3 if zarr_version == 2 else 5 + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 3 == store.counter['__setitem__', attrs_key] assert a['foo'] == 'zzz' assert a['bar'] == 84 - assert 3 == store.counter['__getitem__', 'attrs'] - assert 3 == store.counter['__setitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 3 == store.counter['__setitem__', attrs_key] # test __contains__ uses the cache assert 'foo' in a - assert 3 == store.counter['__getitem__', 'attrs'] - assert 3 == store.counter['__setitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 3 == store.counter['__setitem__', attrs_key] assert 'spam' not in a - assert 3 == store.counter['__getitem__', 'attrs'] - assert 3 == store.counter['__setitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 3 == store.counter['__setitem__', attrs_key] # test __delitem__ updates the cache del a['bar'] - assert 4 == store.counter['__getitem__', 'attrs'] - assert 4 == store.counter['__setitem__', 'attrs'] + get_cnt = 4 if zarr_version == 2 else 7 + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 4 == store.counter['__setitem__', attrs_key] assert 'bar' not in a - assert 4 == store.counter['__getitem__', 'attrs'] - assert 4 == store.counter['__setitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 4 == store.counter['__setitem__', attrs_key] # test refresh() - store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') - assert 4 == store.counter['__getitem__', 'attrs'] + if zarr_version == 2: + store[attrs_key] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + else: + store[attrs_key] = json.dumps(dict(attributes=dict(foo='xxx', bar=42))).encode('ascii') + assert get_cnt == store.counter['__getitem__', attrs_key] a.refresh() - assert 5 == store.counter['__getitem__', 'attrs'] + get_cnt = 5 if zarr_version == 2 else 8 + assert get_cnt == store.counter['__getitem__', attrs_key] assert a['foo'] == 'xxx' - assert 5 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', attrs_key] assert a['bar'] == 42 - assert 5 == store.counter['__getitem__', 'attrs'] + assert get_cnt == store.counter['__getitem__', attrs_key] - def test_caching_off(self): + def test_caching_off(self, zarr_version): # setup store - store = CountingDict() - assert 0 == store.counter['__getitem__', 'attrs'] - assert 0 == store.counter['__setitem__', 'attrs'] - store['attrs'] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') - assert 0 == store.counter['__getitem__', 'attrs'] - assert 1 == store.counter['__setitem__', 'attrs'] + store = CountingDict() if zarr_version == 2 else CountingDictV3() + attrs_key = '.zattrs' if zarr_version == 2 else 'meta/root/attrs' + assert 0 == store.counter['__getitem__', attrs_key] + assert 0 == store.counter['__setitem__', attrs_key] + + if zarr_version == 2: + store[attrs_key] = json.dumps(dict(foo='xxx', bar=42)).encode('ascii') + else: + store[attrs_key] = json.dumps(dict(attributes=dict(foo='xxx', bar=42))).encode('ascii') + assert 0 == store.counter['__getitem__', attrs_key] + assert 1 == 
store.counter['__setitem__', attrs_key] # setup attributes - a = self.init_attributes(store, cache=False) + a = self.init_attributes(store, cache=False, zarr_version=zarr_version) # test __getitem__ assert a['foo'] == 'xxx' - assert 1 == store.counter['__getitem__', 'attrs'] + assert 1 == store.counter['__getitem__', attrs_key] assert a['bar'] == 42 - assert 2 == store.counter['__getitem__', 'attrs'] + assert 2 == store.counter['__getitem__', attrs_key] assert a['foo'] == 'xxx' - assert 3 == store.counter['__getitem__', 'attrs'] + assert 3 == store.counter['__getitem__', attrs_key] # test __setitem__ a['foo'] = 'yyy' - assert 4 == store.counter['__getitem__', 'attrs'] - assert 2 == store.counter['__setitem__', 'attrs'] + get_cnt = 4 if zarr_version == 2 else 5 + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 2 == store.counter['__setitem__', attrs_key] assert a['foo'] == 'yyy' - assert 5 == store.counter['__getitem__', 'attrs'] - assert 2 == store.counter['__setitem__', 'attrs'] + get_cnt = 5 if zarr_version == 2 else 6 + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 2 == store.counter['__setitem__', attrs_key] # test update() a.update(foo='zzz', bar=84) - assert 6 == store.counter['__getitem__', 'attrs'] - assert 3 == store.counter['__setitem__', 'attrs'] + get_cnt = 6 if zarr_version == 2 else 8 + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 3 == store.counter['__setitem__', attrs_key] assert a['foo'] == 'zzz' assert a['bar'] == 84 - assert 8 == store.counter['__getitem__', 'attrs'] - assert 3 == store.counter['__setitem__', 'attrs'] + get_cnt = 8 if zarr_version == 2 else 10 + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 3 == store.counter['__setitem__', attrs_key] # test __contains__ assert 'foo' in a - assert 9 == store.counter['__getitem__', 'attrs'] - assert 3 == store.counter['__setitem__', 'attrs'] + get_cnt = 9 if zarr_version == 2 else 11 + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 3 == store.counter['__setitem__', attrs_key] assert 'spam' not in a - assert 10 == store.counter['__getitem__', 'attrs'] - assert 3 == store.counter['__setitem__', 'attrs'] + get_cnt = 10 if zarr_version == 2 else 12 + assert get_cnt == store.counter['__getitem__', attrs_key] + assert 3 == store.counter['__setitem__', attrs_key] diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index a6041b788e..74c8d06fac 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -26,30 +26,54 @@ from zarr.hierarchy import Group, group from zarr.storage import ( ConsolidatedMetadataStore, + ConsolidatedMetadataStoreV3, + DirectoryStoreV3, + FSStoreV3, + KVStore, + KVStoreV3, MemoryStore, + MemoryStoreV3, + SQLiteStoreV3, atexit_rmtree, + data_root, + meta_root, getsize, ) +from zarr.tests.util import have_fsspec -def test_open_array(path_type): +def _init_creation_kwargs(zarr_version): + kwargs = {'zarr_version': zarr_version} + if zarr_version == 3: + kwargs['path'] = 'dataset' + return kwargs + + +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_open_array(path_type, zarr_version): store = tempfile.mkdtemp() atexit.register(atexit_rmtree, store) store = path_type(store) + kwargs = _init_creation_kwargs(zarr_version) # open array, create if doesn't exist - z = open(store, mode='a', shape=100) + z = open(store, mode='a', shape=100, **kwargs) assert isinstance(z, Array) assert z.shape == (100,) # open array, overwrite - z = open(store, mode='w', 
shape=200) + z = open(store, mode='w', shape=200, **kwargs) assert isinstance(z, Array) assert z.shape == (200,) + if zarr_version == 3: + # cannot open a v3 array without path + with pytest.raises(ValueError): + open(store, mode='w', shape=200, zarr_version=3) + # open array, read-only - z = open(store, mode='r') + z = open(store, mode='r', **kwargs) assert isinstance(z, Array) assert z.shape == (200,) assert z.read_only @@ -59,44 +83,83 @@ def test_open_array(path_type): open('doesnotexist', mode='r') -def test_open_group(path_type): +@pytest.mark.parametrize("zarr_version", [2, 3]) +def test_open_group(path_type, zarr_version): store = tempfile.mkdtemp() atexit.register(atexit_rmtree, store) store = path_type(store) + kwargs = _init_creation_kwargs(zarr_version) # open group, create if doesn't exist - g = open(store, mode='a') + g = open(store, mode='a', **kwargs) g.create_group('foo') assert isinstance(g, Group) assert 'foo' in g # open group, overwrite - g = open(store, mode='w') + g = open(store, mode='w', **kwargs) assert isinstance(g, Group) assert 'foo' not in g + if zarr_version == 3: + # cannot open a v3 group without path + with pytest.raises(ValueError): + open(store, mode='w', zarr_version=3) + # open group, read-only - g = open(store, mode='r') + g = open(store, mode='r', **kwargs) assert isinstance(g, Group) assert g.read_only -def test_save_errors(): +@pytest.mark.parametrize("zarr_version", [2, 3]) +def test_save_errors(zarr_version): with pytest.raises(ValueError): # no arrays provided - save_group('data/group.zarr') + save_group('data/group.zarr', zarr_version=zarr_version) + with pytest.raises(TypeError): + # no array provided + save_array('data/group.zarr', zarr_version=zarr_version) with pytest.raises(ValueError): # no arrays provided - save('data/group.zarr') + save('data/group.zarr', zarr_version=zarr_version) + + +def test_zarr_v3_save_multiple_unnamed(): + x = np.ones(8) + y = np.zeros(8) + store = KVStoreV3(dict()) + # no path provided + save_group(store, x, y, path='dataset', zarr_version=3) + # names become arr_{i} for unnamed *args + assert data_root + 'dataset/arr_0/c0' in store + assert data_root + 'dataset/arr_1/c0' in store + assert meta_root + 'dataset/arr_0.array.json' in store + assert meta_root + 'dataset/arr_1.array.json' in store -def test_lazy_loader(): +def test_zarr_v3_save_errors(): + x = np.ones(8) + with pytest.raises(ValueError): + # no path provided + save_group('data/group.zr3', x, zarr_version=3) + with pytest.raises(ValueError): + # no path provided + save_array('data/group.zr3', x, zarr_version=3) + with pytest.raises(ValueError): + # no path provided + save('data/group.zr3', x, zarr_version=3) + + +@pytest.mark.parametrize("zarr_version", [2, 3]) +def test_lazy_loader(zarr_version): foo = np.arange(100) bar = np.arange(100, 0, -1) - store = 'data/group.zarr' - save(store, foo=foo, bar=bar) - loader = load(store) + store = 'data/group.zarr' if zarr_version == 2 else 'data/group.zr3' + kwargs = _init_creation_kwargs(zarr_version) + save(store, foo=foo, bar=bar, **kwargs) + loader = load(store, **kwargs) assert 'foo' in loader assert 'bar' in loader assert 'baz' not in loader @@ -104,13 +167,58 @@ def test_lazy_loader(): assert sorted(loader) == ['bar', 'foo'] assert_array_equal(foo, loader['foo']) assert_array_equal(bar, loader['bar']) + assert 'LazyLoader: ' in repr(loader) -def test_consolidate_metadata(): +@pytest.mark.parametrize("zarr_version", [2, 3]) +def test_load_array(zarr_version): + foo = np.arange(100) + bar = np.arange(100, 
0, -1) + store = 'data/group.zarr' if zarr_version == 2 else 'data/group.zr3' + kwargs = _init_creation_kwargs(zarr_version) + save(store, foo=foo, bar=bar, **kwargs) + + # can also load arrays directly into a numpy array + for array_name in ['foo', 'bar']: + array_path = 'dataset/' + array_name if zarr_version == 3 else array_name + array = load(store, path=array_path, zarr_version=zarr_version) + assert isinstance(array, np.ndarray) + if array_name == 'foo': + assert_array_equal(foo, array) + else: + assert_array_equal(bar, array) + + +@pytest.mark.parametrize("zarr_version", [2, 3]) +def test_tree(zarr_version): + kwargs = _init_creation_kwargs(zarr_version) + g1 = zarr.group(**kwargs) + g1.create_group('foo') + g3 = g1.create_group('bar') + g3.create_group('baz') + g5 = g3.create_group('qux') + g5.create_dataset('baz', shape=100, chunks=10) + assert repr(zarr.tree(g1)) == repr(g1.tree()) + assert str(zarr.tree(g1)) == str(g1.tree()) + + +# TODO: consolidated metadata currently only supported for v2 + +@pytest.mark.parametrize('zarr_version', [2, 3]) +@pytest.mark.parametrize('with_chunk_store', [False, True], ids=['default', 'with_chunk_store']) +def test_consolidate_metadata(with_chunk_store, zarr_version): + + if zarr_version == 2: + MemoryStoreClass = MemoryStore + path = '' + else: + MemoryStoreClass = MemoryStoreV3 + path = 'dataset' # setup initial data - store = MemoryStore() - z = group(store) + store = MemoryStoreClass() + chunk_store = MemoryStoreClass() if with_chunk_store else None + z = group(store, chunk_store=chunk_store, path=path) z.create_group('g1') g2 = z.create_group('g2') g2.attrs['hello'] = 'world' @@ -121,20 +229,41 @@ def test_consolidate_metadata(): arr[:] = 1.0 assert 16 == arr.nchunks_initialized + if zarr_version == 3: + # error on v3 if path not provided + with pytest.raises(ValueError): + consolidate_metadata(store, path=None) + + with pytest.raises(ValueError): + consolidate_metadata(store, path='') + # perform consolidation - out = consolidate_metadata(store) + out = consolidate_metadata(store, path=path) assert isinstance(out, Group) - assert '.zmetadata' in store - for key in ['.zgroup', - 'g1/.zgroup', - 'g2/.zgroup', - 'g2/.zattrs', - 'g2/arr/.zarray', - 'g2/arr/.zattrs']: + assert ['g1', 'g2'] == list(out) + if zarr_version == 2: + assert isinstance(out._store, ConsolidatedMetadataStore) + assert '.zmetadata' in store + meta_keys = ['.zgroup', + 'g1/.zgroup', + 'g2/.zgroup', + 'g2/.zattrs', + 'g2/arr/.zarray', + 'g2/arr/.zattrs'] + else: + assert isinstance(out._store, ConsolidatedMetadataStoreV3) + assert 'meta/root/consolidated/.zmetadata' in store + meta_keys = ['zarr.json', + meta_root + 'dataset.group.json', + meta_root + 'dataset/g1.group.json', + meta_root + 'dataset/g2.group.json', + meta_root + 'dataset/g2/arr.array.json', + 'meta/root/consolidated.group.json'] + for key in meta_keys: del store[key] # open consolidated - z2 = open_consolidated(store) + z2 = open_consolidated(store, chunk_store=chunk_store, path=path) assert ['g1', 'g2'] == list(z2) assert 'world' == z2.g2.attrs['hello'] assert 1 == z2.g2.arr.attrs['data'] @@ -143,11 +272,18 @@ def test_consolidate_metadata(): assert 16 == z2.g2.arr.nchunks_initialized # tests del/write on the store - cmd = ConsolidatedMetadataStore(store) - with pytest.raises(PermissionError): - del cmd['.zgroup'] - with pytest.raises(PermissionError): - cmd['.zgroup'] = None + if zarr_version == 2: + cmd = ConsolidatedMetadataStore(store) + with pytest.raises(PermissionError): + del cmd['.zgroup'] + with 
pytest.raises(PermissionError): + cmd['.zgroup'] = None + else: + cmd = ConsolidatedMetadataStoreV3(store) + with pytest.raises(PermissionError): + del cmd[meta_root + 'dataset.group.json'] + with pytest.raises(PermissionError): + cmd[meta_root + 'dataset.group.json'] = None # test getsize on the store assert isinstance(getsize(cmd), Integral) @@ -172,14 +308,16 @@ def test_consolidate_metadata(): # test invalid modes with pytest.raises(ValueError): - open_consolidated(store, mode='a') + open_consolidated(store, chunk_store=chunk_store, mode='a', path=path) with pytest.raises(ValueError): - open_consolidated(store, mode='w') + open_consolidated(store, chunk_store=chunk_store, mode='w', path=path) with pytest.raises(ValueError): - open_consolidated(store, mode='w-') + open_consolidated(store, chunk_store=chunk_store, mode='w-', path=path) # make sure keyword arguments are passed through without error - open_consolidated(store, cache_attrs=True, synchronizer=None) + open_consolidated( + store, chunk_store=chunk_store, path=path, cache_attrs=True, synchronizer=None + ) def test_consolidated_with_chunk_store(): @@ -247,6 +385,8 @@ def test_save_array_separator(tmpdir, options): class TestCopyStore(unittest.TestCase): + _version = 2 + def setUp(self): source = dict() source['foo'] = b'xxx' @@ -254,9 +394,12 @@ def setUp(self): source['bar/qux'] = b'zzz' self.source = source + def _get_dest_store(self): + return dict() + def test_no_paths(self): source = self.source - dest = dict() + dest = self._get_dest_store() copy_store(source, dest) assert len(source) == len(dest) for key in source: @@ -266,7 +409,7 @@ def test_source_path(self): source = self.source # paths should be normalized for source_path in 'bar', 'bar/', '/bar', '/bar/': - dest = dict() + dest = self._get_dest_store() copy_store(source, dest, source_path=source_path) assert 2 == len(dest) for key in source: @@ -280,11 +423,14 @@ def test_dest_path(self): source = self.source # paths should be normalized for dest_path in 'new', 'new/', '/new', '/new/': - dest = dict() + dest = self._get_dest_store() copy_store(source, dest, dest_path=dest_path) assert len(source) == len(dest) for key in source: - dest_key = 'new/' + key + if self._version == 3: + dest_key = key[:10] + 'new/' + key[10:] + else: + dest_key = 'new/' + key assert source[key] == dest[dest_key] def test_source_dest_path(self): @@ -292,7 +438,7 @@ def test_source_dest_path(self): # paths should be normalized for source_path in 'bar', 'bar/', '/bar', '/bar/': for dest_path in 'new', 'new/', '/new', '/new/': - dest = dict() + dest = self._get_dest_store() copy_store(source, dest, source_path=source_path, dest_path=dest_path) assert 2 == len(dest) @@ -308,41 +454,44 @@ def test_excludes_includes(self): source = self.source # single excludes - dest = dict() + dest = self._get_dest_store() excludes = 'f.*' copy_store(source, dest, excludes=excludes) assert len(dest) == 2 - assert 'foo' not in dest + + root = '' if self._version == 2 else meta_root + assert root + 'foo' not in dest # multiple excludes - dest = dict() + dest = self._get_dest_store() excludes = 'b.z', '.*x' copy_store(source, dest, excludes=excludes) assert len(dest) == 1 - assert 'foo' in dest - assert 'bar/baz' not in dest - assert 'bar/qux' not in dest + assert root + 'foo' in dest + assert root + 'bar/baz' not in dest + assert root + 'bar/qux' not in dest # excludes and includes - dest = dict() + dest = self._get_dest_store() excludes = 'b.*' includes = '.*x' copy_store(source, dest, excludes=excludes, 
includes=includes) assert len(dest) == 2 - assert 'foo' in dest - assert 'bar/baz' not in dest - assert 'bar/qux' in dest + assert root + 'foo' in dest + assert root + 'bar/baz' not in dest + assert root + 'bar/qux' in dest def test_dry_run(self): source = self.source - dest = dict() + dest = self._get_dest_store() copy_store(source, dest, dry_run=True) assert 0 == len(dest) def test_if_exists(self): source = self.source - dest = dict() - dest['bar/baz'] = b'mmm' + dest = self._get_dest_store() + root = '' if self._version == 2 else meta_root + dest[root + 'bar/baz'] = b'mmm' # default ('raise') with pytest.raises(CopyError): @@ -355,22 +504,43 @@ def test_if_exists(self): # skip copy_store(source, dest, if_exists='skip') assert 3 == len(dest) - assert dest['foo'] == b'xxx' - assert dest['bar/baz'] == b'mmm' - assert dest['bar/qux'] == b'zzz' + assert dest[root + 'foo'] == b'xxx' + assert dest[root + 'bar/baz'] == b'mmm' + assert dest[root + 'bar/qux'] == b'zzz' # replace copy_store(source, dest, if_exists='replace') assert 3 == len(dest) - assert dest['foo'] == b'xxx' - assert dest['bar/baz'] == b'yyy' - assert dest['bar/qux'] == b'zzz' + assert dest[root + 'foo'] == b'xxx' + assert dest[root + 'bar/baz'] == b'yyy' + assert dest[root + 'bar/qux'] == b'zzz' # invalid option with pytest.raises(ValueError): copy_store(source, dest, if_exists='foobar') +class TestCopyStoreV3(TestCopyStore): + + _version = 3 + + def setUp(self): + source = KVStoreV3(dict()) + source['meta/root/foo'] = b'xxx' + source['meta/root/bar/baz'] = b'yyy' + source['meta/root/bar/qux'] = b'zzz' + self.source = source + + def _get_dest_store(self): + return KVStoreV3(dict()) + + def test_mismatched_store_versions(self): + # cannot copy between stores of mixed Zarr versions + dest = KVStore(dict()) + with pytest.raises(ValueError): + copy_store(self.source, dest) + + def check_copied_array(original, copied, without_attrs=False, expect_props=None): @@ -419,7 +589,14 @@ def check_copied_array(original, copied, without_attrs=False, for k in original.attrs.keys(): assert k not in copied.attrs else: - assert sorted(original.attrs.items()) == sorted(copied.attrs.items()) + if dest_h5py and 'filters' in original.attrs: + # special case in v3 (storing filters metadata under attributes) + # we explicitly do not copy this info over to HDF5 + original_attrs = original.attrs.asdict().copy() + original_attrs.pop('filters') + else: + original_attrs = original.attrs + assert sorted(original_attrs.items()) == sorted(copied.attrs.items()) def check_copied_group(original, copied, without_attrs=False, expect_props=None, @@ -473,10 +650,32 @@ def test_copy_all(): dry_run=False, ) + assert 'subgroup' in destination_group assert destination_group.attrs["info"] == "group attrs" assert destination_group.subgroup.attrs["info"] == "sub attrs" +def test_copy_all_v3(): + """ + https://github.com/zarr-developers/zarr-python/issues/269 + + copy_all used to not copy attributes as `.keys()` + + """ + original_group = zarr.group(store=MemoryStoreV3(), path='group1', overwrite=True) + original_group.create_group("subgroup") + + destination_group = zarr.group(store=MemoryStoreV3(), path='group2', overwrite=True) + + # copy from memory to directory store + copy_all( + original_group, + destination_group, + dry_run=False, + ) + assert 'subgroup' in destination_group + + class TestCopy: @pytest.fixture(params=[False, True], ids=['zarr', 'hdf5']) def source(self, request, tmpdir): @@ -719,3 +918,88 @@ def test_logging(self, source, dest, tmpdir): # bad option 
with pytest.raises(TypeError): copy(source['foo'], dest, dry_run=True, log=True) + + +class TestCopyV3(TestCopy): + + @pytest.fixture(params=['zarr', 'hdf5']) + def source(self, request, tmpdir): + def prep_source(source): + foo = source.create_group('foo') + foo.attrs['experiment'] = 'weird science' + baz = foo.create_dataset('bar/baz', data=np.arange(100), chunks=(50,)) + baz.attrs['units'] = 'metres' + if request.param == 'hdf5': + extra_kws = dict(compression='gzip', compression_opts=3, fillvalue=84, + shuffle=True, fletcher32=True) + else: + extra_kws = dict(compressor=Zlib(3), order='F', fill_value=42, filters=[Adler32()]) + source.create_dataset('spam', data=np.arange(100, 200).reshape(20, 5), + chunks=(10, 2), dtype='i2', **extra_kws) + return source + + if request.param == 'hdf5': + h5py = pytest.importorskip('h5py') + fn = tmpdir.join('source.h5') + with h5py.File(str(fn), mode='w') as h5f: + yield prep_source(h5f) + elif request.param == 'zarr': + yield prep_source(group(path='group1', zarr_version=3)) + + # Test with various destination StoreV3 types as TestCopyV3 covers rmdir + destinations = ['hdf5', 'zarr', 'zarr_kvstore', 'zarr_directorystore', 'zarr_sqlitestore'] + if have_fsspec: + destinations += ['zarr_fsstore'] + + @pytest.fixture(params=destinations) + def dest(self, request, tmpdir): + if request.param == 'hdf5': + h5py = pytest.importorskip('h5py') + fn = tmpdir.join('dest.h5') + with h5py.File(str(fn), mode='w') as h5f: + yield h5f + elif request.param == 'zarr': + yield group(path='group2', zarr_version=3) + elif request.param == 'zarr_kvstore': + store = KVStoreV3(dict()) + yield group(store, path='group2', zarr_version=3) + elif request.param == 'zarr_fsstore': + fn = tmpdir.join('dest.zr3') + store = FSStoreV3(str(fn), auto_mkdir=True) + yield group(store, path='group2', zarr_version=3) + elif request.param == 'zarr_directorystore': + fn = tmpdir.join('dest.zr3') + store = DirectoryStoreV3(str(fn)) + yield group(store, path='group2', zarr_version=3) + elif request.param == 'zarr_sqlitestore': + fn = tmpdir.join('dest.db') + store = SQLiteStoreV3(str(fn)) + yield group(store, path='group2', zarr_version=3) + + def test_copy_array_create_options(self, source, dest): + dest_h5py = dest.__module__.startswith('h5py.') + + # copy array, provide creation options + compressor = Zlib(9) + create_kws = dict(chunks=(10,)) + if dest_h5py: + create_kws.update(compression='gzip', compression_opts=9, + shuffle=True, fletcher32=True, fillvalue=42) + else: + # v3 case has no filters argument in zarr create_kws + create_kws.update(compressor=compressor, fill_value=42, order='F') + copy(source['foo/bar/baz'], dest, without_attrs=True, **create_kws) + check_copied_array(source['foo/bar/baz'], dest['baz'], + without_attrs=True, expect_props=create_kws) + + def test_copy_group_no_name(self, source, dest): + if source.__module__.startswith('h5py'): + with pytest.raises(TypeError): + copy(source, dest) + else: + # For v3, dest.name will be inferred from source.name + copy(source, dest) + check_copied_group(source, dest[source.name.lstrip('/')]) + + copy(source, dest, name='root') + check_copied_group(source, dest['root']) diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index 7423132887..08bda94ba2 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -17,7 +17,13 @@ from numpy.testing import assert_array_almost_equal, assert_array_equal from pkg_resources import parse_version +from zarr._storage.store import ( + _prefix_to_array_key, + 
_prefix_to_attrs_key, + _prefix_to_group_key +) from zarr.core import Array +from zarr.errors import ArrayNotFoundError, ContainsGroupError from zarr.meta import json_loads from zarr.n5 import N5Store, N5FSStore, n5_keywords from zarr.storage import ( @@ -30,10 +36,21 @@ LRUStoreCache, NestedDirectoryStore, SQLiteStore, + ABSStoreV3, + DBMStoreV3, + DirectoryStoreV3, + FSStoreV3, + KVStoreV3, + LMDBStoreV3, + LRUStoreCacheV3, + SQLiteStoreV3, + StoreV3, atexit_rmglob, atexit_rmtree, + data_root, init_array, init_group, + meta_root, ) from zarr.util import buffer_size from zarr.tests.util import abs_container, skip_test_env_var, have_fsspec @@ -43,6 +60,8 @@ class TestArray(unittest.TestCase): + version = 2 + def test_array_init(self): # normal initialization @@ -528,6 +547,8 @@ def test_setitem_data_not_shared(self): z.store.close() def expected(self): + # tests for array without path will not be run for v3 stores + assert self.version == 2 return [ "063b02ff8d9d3bab6da932ad5828b506ef0a6578", "f97b84dc9ffac807415f750100108764e837bb82", @@ -1111,6 +1132,19 @@ def test_dtypes(self): assert_array_equal(a, z[:]) z.store.close() + # unicode and bytestring dtypes + for dtype in ['S4', 'S6', 'U5', 'U5']: + n = 10 + z = self.create_array(shape=n, chunks=3, dtype=dtype) + assert z.dtype == np.dtype(dtype) + if dtype.startswith('S'): + a = np.asarray([b'name'] * n, dtype=dtype) + else: + a = np.asarray(['§Æ¥¿é'] * n, dtype=dtype) + z[:] = a + np.all(a == z[:]) + z.store.close() + # check that datetime generic units are not allowed with pytest.raises(ValueError): self.create_array(shape=100, dtype='M8') @@ -1180,7 +1214,6 @@ def test_object_arrays(self): def test_object_arrays_vlen_text(self): data = np.array(greetings * 1000, dtype=object) - z = self.create_array(shape=data.shape, dtype=object, object_codec=VLenUTF8()) z[0] = 'foo' assert z[0] == 'foo' @@ -1474,11 +1507,17 @@ def test_attributes(self): a.attrs['foo'] = 'bar' assert a.attrs.key in a.store attrs = json_loads(a.store[a.attrs.key]) + if self.version > 2: + # in v3, attributes are in a sub-dictionary of the metadata + attrs = attrs['attributes'] assert 'foo' in attrs and attrs['foo'] == 'bar' a.attrs['bar'] = 'foo' assert a.attrs.key in a.store attrs = json_loads(a.store[a.attrs.key]) + if self.version > 2: + # in v3, attributes are in a sub-dictionary of the metadata + attrs = attrs['attributes'] assert 'foo' in attrs and attrs['foo'] == 'bar' assert 'bar' in attrs and attrs['bar'] == 'foo' a.store.close() @@ -1508,28 +1547,14 @@ def create_array(read_only=False, **kwargs): def test_nchunks_initialized(self): pass - def test_hexdigest(self): - # Check basic 1-D array - z = self.create_array(shape=(1050,), chunks=100, dtype=' 2 and g1.store.is_erasable(): + arr_path = g1.path + '/arr1' + sfx = _get_metadata_suffix(g1.store) + array_meta_file = meta_root + arr_path + '.array' + sfx + assert array_meta_file in g1.store + group_meta_file = meta_root + g2.path + '.group' + sfx + assert group_meta_file in g1.store + + # rmdir on the array path should also remove the metadata file + g1.store.rmdir(arr_path) + assert array_meta_file not in g1.store + # rmdir on the group path should also remove its metadata file + g1.store.rmdir(g2.path) + assert group_meta_file not in g1.store + + def _dataset_path(self, group, path): + path = path.rstrip('/') + absolute = path.startswith('/') + if absolute: + dataset_path = path + else: + dataset_path = '/'.join([group.path, path]) + dataset_path = dataset_path.lstrip('/') + dataset_name = '/' + dataset_path 
+ return dataset_path, dataset_name + def test_create_dataset(self): g = self.create_group() # create as immediate child - d1 = g.create_dataset('foo', shape=1000, chunks=100) + dpath = 'foo' + d1 = g.create_dataset(dpath, shape=1000, chunks=100) + path, name = self._dataset_path(g, dpath) assert isinstance(d1, Array) assert (1000,) == d1.shape assert (100,) == d1.chunks - assert 'foo' == d1.path - assert '/foo' == d1.name + assert path == d1.path + assert name == d1.name assert g.store is d1.store # create as descendant - d2 = g.create_dataset('/a/b/c/', shape=2000, chunks=200, dtype='i1', + dpath = '/a/b/c/' + d2 = g.create_dataset(dpath, shape=2000, chunks=200, dtype='i1', compression='zlib', compression_opts=9, fill_value=42, order='F') + path, name = self._dataset_path(g, dpath) assert isinstance(d2, Array) assert (2000,) == d2.shape assert (200,) == d2.chunks @@ -234,20 +312,22 @@ def test_create_dataset(self): assert 9 == d2.compressor.level assert 42 == d2.fill_value assert 'F' == d2.order - assert 'a/b/c' == d2.path - assert '/a/b/c' == d2.name + assert path == d2.path + assert name == d2.name assert g.store is d2.store # create with data data = np.arange(3000, dtype='u2') - d3 = g.create_dataset('bar', data=data, chunks=300) + dpath = 'bar' + d3 = g.create_dataset(dpath, data=data, chunks=300) + path, name = self._dataset_path(g, dpath) assert isinstance(d3, Array) assert (3000,) == d3.shape assert (300,) == d3.chunks assert np.dtype('u2') == d3.dtype assert_array_equal(data, d3[:]) - assert 'bar' == d3.path - assert '/bar' == d3.name + assert path == d3.path + assert name == d3.name assert g.store is d3.store # compression arguments handling follows... @@ -290,25 +370,27 @@ def test_require_dataset(self): g = self.create_group() # create - d1 = g.require_dataset('foo', shape=1000, chunks=100, dtype='f4') + dpath = 'foo' + d1 = g.require_dataset(dpath, shape=1000, chunks=100, dtype='f4') d1[:] = np.arange(1000) + path, name = self._dataset_path(g, dpath) assert isinstance(d1, Array) assert (1000,) == d1.shape assert (100,) == d1.chunks assert np.dtype('f4') == d1.dtype - assert 'foo' == d1.path - assert '/foo' == d1.name + assert path == d1.path + assert name == d1.name assert g.store is d1.store assert_array_equal(np.arange(1000), d1[:]) # require - d2 = g.require_dataset('foo', shape=1000, chunks=100, dtype='f4') + d2 = g.require_dataset(dpath, shape=1000, chunks=100, dtype='f4') assert isinstance(d2, Array) assert (1000,) == d2.shape assert (100,) == d2.chunks assert np.dtype('f4') == d2.dtype - assert 'foo' == d2.path - assert '/foo' == d2.name + assert path == d2.path + assert name == d2.name assert g.store is d2.store assert_array_equal(np.arange(1000), d2[:]) assert d1 == d2 @@ -419,7 +501,12 @@ def test_getitem_contains_iterators(self): # setup g1 = self.create_group() g2 = g1.create_group('foo/bar') - d1 = g2.create_dataset('/a/b/c', shape=1000, chunks=100) + if g1._version == 2: + d1 = g2.create_dataset('/a/b/c', shape=1000, chunks=100) + else: + # v3: cannot create a dataset at the root by starting with / + # instead, need to create the dataset on g1 directly + d1 = g1.create_dataset('a/b/c', shape=1000, chunks=100) d1[:] = np.arange(1000) d2 = g1.create_dataset('foo/baz', shape=3000, chunks=300) d2[:] = np.arange(3000) @@ -428,7 +515,13 @@ def test_getitem_contains_iterators(self): assert isinstance(g1['foo'], Group) assert isinstance(g1['foo']['bar'], Group) assert isinstance(g1['foo/bar'], Group) - assert isinstance(g1['/foo/bar/'], Group) + if g1._version == 2: + 
assert isinstance(g1['/foo/bar/'], Group) + else: + # start or end with / raises KeyError + # TODO: should we allow stripping of these on v3? + with pytest.raises(KeyError): + assert isinstance(g1['/foo/bar/'], Group) assert isinstance(g1['foo/baz'], Array) assert g2 == g1['foo/bar'] assert g1['foo']['bar'] == g1['foo/bar'] @@ -454,7 +547,9 @@ def test_getitem_contains_iterators(self): assert 'baz' not in g1 assert 'a/b/c/d' not in g1 assert 'a/z' not in g1 - assert 'quux' not in g1['foo'] + if g1._version == 2: + # TODO: handle implicit group for v3 spec + assert 'quux' not in g1['foo'] # test key errors with pytest.raises(KeyError): @@ -470,12 +565,19 @@ def test_getitem_contains_iterators(self): assert 1 == len(g1['a/b']) # test __iter__, keys() - # currently assumes sorted by key - assert ['a', 'foo'] == list(g1) - assert ['a', 'foo'] == list(g1.keys()) - assert ['bar', 'baz'] == list(g1['foo']) - assert ['bar', 'baz'] == list(g1['foo'].keys()) + if g1._version == 2: + # currently assumes sorted by key + assert ['a', 'foo'] == list(g1) + assert ['a', 'foo'] == list(g1.keys()) + assert ['bar', 'baz'] == list(g1['foo']) + assert ['bar', 'baz'] == list(g1['foo'].keys()) + else: + # v3 is not necessarily sorted by key + assert ['a', 'foo'] == sorted(list(g1)) + assert ['a', 'foo'] == sorted(list(g1.keys())) + assert ['bar', 'baz'] == sorted(list(g1['foo'])) + assert ['bar', 'baz'] == sorted(list(g1['foo'].keys())) assert [] == sorted(g1['foo/bar']) assert [] == sorted(g1['foo/bar'].keys()) @@ -484,6 +586,9 @@ def test_getitem_contains_iterators(self): items = list(g1.items()) values = list(g1.values()) + if g1._version == 3: + # v3 items are not automatically sorted by key + items, values = zip(*sorted(zip(items, values), key=lambda x: x[0])) assert 'a' == items[0][0] assert g1['a'] == items[0][1] assert g1['a'] == values[0] @@ -493,6 +598,9 @@ def test_getitem_contains_iterators(self): items = list(g1['foo'].items()) values = list(g1['foo'].values()) + if g1._version == 3: + # v3 items are not automatically sorted by key + items, values = zip(*sorted(zip(items, values), key=lambda x: x[0])) assert 'bar' == items[0][0] assert g1['foo']['bar'] == items[0][1] assert g1['foo']['bar'] == values[0] @@ -501,11 +609,16 @@ def test_getitem_contains_iterators(self): assert g1['foo']['baz'] == values[1] # test array_keys(), arrays(), group_keys(), groups() - # currently assumes sorted by key - assert ['a', 'foo'] == list(g1.group_keys()) groups = list(g1.groups()) arrays = list(g1.arrays()) + if g1._version == 2: + # currently assumes sorted by key + assert ['a', 'foo'] == list(g1.group_keys()) + else: + assert ['a', 'foo'] == sorted(list(g1.group_keys())) + groups = sorted(groups) + arrays = sorted(arrays) assert 'a' == groups[0][0] assert g1['a'] == groups[0][1] assert 'foo' == groups[1][0] @@ -517,6 +630,9 @@ def test_getitem_contains_iterators(self): assert ['baz'] == list(g1['foo'].array_keys()) groups = list(g1['foo'].groups()) arrays = list(g1['foo'].arrays()) + if g1._version == 3: + groups = sorted(groups) + arrays = sorted(arrays) assert 'bar' == groups[0][0] assert g1['foo']['bar'] == groups[0][1] assert 'baz' == arrays[0][0] @@ -537,21 +653,27 @@ def visitor4(name, obj): del items[:] g1.visitvalues(visitor2) - assert [ + expected_items = [ "a", "a/b", "a/b/c", "foo", "foo/bar", "foo/baz", - ] == items + ] + if g1._version == 3: + expected_items = [g1.path + '/' + i for i in expected_items] + assert expected_items == items del items[:] g1["foo"].visitvalues(visitor2) - assert [ + expected_items = 
[ "foo/bar", "foo/baz", - ] == items + ] + if g1._version == 3: + expected_items = [g1.path + '/' + i for i in expected_items] + assert expected_items == items del items[:] g1.visit(visitor3) @@ -627,6 +749,9 @@ def visitor0(val, *args): # noinspection PyUnusedLocal def visitor1(val, *args): name = getattr(val, "path", val) + if name.startswith('group/'): + # strip the group path for v3 + name = name[6:] if name == "a/b/c": return True @@ -762,9 +887,13 @@ def test_move(self): g2.move("bar", "/bar") assert "foo2" in g assert "foo2/bar" not in g - assert "bar" in g + if g2._version == 2: + # TODO: how to access element created outside of group.path in v3? + assert "bar" in g assert isinstance(g["foo2"], Group) - assert_array_equal(data, g["bar"]) + if g2._version == 2: + # TODO: how to access element created outside of group.path in v3? + assert_array_equal(data, g["bar"]) with pytest.raises(ValueError): g2.move("bar", "bar2") @@ -841,6 +970,9 @@ def test_paths(self): g1 = self.create_group() g2 = g1.create_group('foo/bar') + if g1._version == 3: + pytest.skip("TODO: update class for v3") + assert g1 == g1['/'] assert g1 == g1['//'] assert g1 == g1['///'] @@ -893,7 +1025,9 @@ def test_pickle(self): assert name == g2.name assert n == len(g2) assert keys == list(g2) - assert isinstance(g2['foo'], Group) + if g2._version == 2: + # TODO: handle implicit group for v3 + assert isinstance(g2['foo'], Group) assert isinstance(g2['foo/bar'], Array) g2.store.close() @@ -921,6 +1055,57 @@ def test_group_init_from_dict(chunk_dict): assert chunk_store is not g.chunk_store +# noinspection PyStatementEffect +class TestGroupV3(TestGroup, unittest.TestCase): + + @staticmethod + def create_store(): + # can be overridden in sub-classes + return KVStoreV3(dict()), None + + def create_group(self, store=None, path='group', read_only=False, + chunk_store=None, synchronizer=None): + # can be overridden in sub-classes + if store is None: + store, chunk_store = self.create_store() + init_group(store, path=path, chunk_store=chunk_store) + g = Group(store, path=path, read_only=read_only, + chunk_store=chunk_store, synchronizer=synchronizer) + return g + + def test_group_init_1(self): + store, chunk_store = self.create_store() + g = self.create_group(store, chunk_store=chunk_store) + assert store is g.store + if chunk_store is None: + assert store is g.chunk_store + else: + assert chunk_store is g.chunk_store + assert not g.read_only + # different path/name in v3 case + assert 'group' == g.path + assert '/group' == g.name + assert 'group' == g.basename + + assert isinstance(g.attrs, Attributes) + g.attrs['foo'] = 'bar' + assert g.attrs['foo'] == 'bar' + + assert isinstance(g.info, InfoReporter) + assert isinstance(repr(g.info), str) + assert isinstance(g.info._repr_html_(), str) + store.close() + + def test_group_init_errors_2(self): + store, chunk_store = self.create_store() + path = 'tmp' + init_array(store, path=path, shape=1000, chunks=100, chunk_store=chunk_store) + # array blocks group + with pytest.raises(ValueError): + Group(store, path=path, chunk_store=chunk_store) + store.close() + + class TestGroupWithMemoryStore(TestGroup): @staticmethod @@ -928,6 +1113,14 @@ def create_store(): return MemoryStore(), None +# TODO: fix MemoryStoreV3 _get_parent, etc. 
+# # noinspection PyStatementEffect +# class TestGroupV3WithMemoryStore(TestGroupWithMemoryStore, TestGroupV3): + +# @staticmethod +# def create_store(): +# return MemoryStoreV3(), None + class TestGroupWithDirectoryStore(TestGroup): @staticmethod @@ -938,6 +1131,16 @@ def create_store(): return store, None +class TestGroupV3WithDirectoryStore(TestGroupWithDirectoryStore, TestGroupV3): + + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStoreV3(path) + return store, None + + @skip_test_env_var("ZARR_TEST_ABS") class TestGroupWithABSStore(TestGroup): @@ -954,6 +1157,22 @@ def test_pickle(self): super().test_pickle() +@skip_test_env_var("ZARR_TEST_ABS") +class TestGroupWithABSStoreV3(TestGroupV3): + + @staticmethod + def create_store(): + container_client = abs_container() + store = ABSStoreV3(client=container_client) + store.rmdir() + return store, None + + @pytest.mark.skipif(sys.version_info < (3, 7), reason="attr not serializable in py36") + def test_pickle(self): + # internal attribute on ContainerClient isn't serializable for py36 and earlier + super().test_pickle() + + class TestGroupWithNestedDirectoryStore(TestGroup): @staticmethod @@ -982,10 +1201,45 @@ def test_round_trip_nd(self): f = open_group(store, mode='w') f.create_dataset(name, data=data, chunks=(5, 5, 5), compressor=None) + assert name in f h = open_group(store, mode='r') np.testing.assert_array_equal(h[name][:], data) +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestGroupV3WithFSStore(TestGroupWithFSStore, TestGroupV3): + + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStoreV3(path) + return store, None + + def test_round_trip_nd(self): + data = np.arange(1000).reshape(10, 10, 10) + name = 'raw' + + store, _ = self.create_store() + f = open_group(store, path='group', mode='w') + f.create_dataset(name, data=data, chunks=(5, 5, 5), + compressor=None) + h = open_group(store, path='group', mode='r') + np.testing.assert_array_equal(h[name][:], data) + + f = open_group(store, path='group2', mode='w') + + data_size = data.nbytes + group_meta_size = buffer_size(store[meta_root + 'group.group.json']) + group2_meta_size = buffer_size(store[meta_root + 'group2.group.json']) + array_meta_size = buffer_size(store[meta_root + 'group/raw.array.json']) + assert store.getsize() == data_size + group_meta_size + group2_meta_size + array_meta_size + # added case with path to complete coverage + assert store.getsize('group') == data_size + group_meta_size + array_meta_size + assert store.getsize('group2') == group2_meta_size + assert store.getsize('group/raw') == data_size + array_meta_size + + @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestGroupWithNestedFSStore(TestGroupWithFSStore): @@ -1009,6 +1263,29 @@ def test_inconsistent_dimension_separator(self): compressor=None, dimension_separator='.') +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestGroupV3WithNestedFSStore(TestGroupV3WithFSStore): + + @staticmethod + def create_store(): + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = FSStoreV3(path, key_separator='/', auto_mkdir=True) + return store, None + + def test_inconsistent_dimension_separator(self): + data = np.arange(1000).reshape(10, 10, 10) + name = 'raw' + + store, _ = self.create_store() + f = open_group(store, path='group', mode='w') + + # cannot specify dimension_separator that 
conflicts with the store + with pytest.raises(ValueError): + f.create_dataset(name, data=data, chunks=(5, 5, 5), + compressor=None, dimension_separator='.') + + class TestGroupWithZipStore(TestGroup): @staticmethod @@ -1036,6 +1313,16 @@ def test_move(self): pass +class TestGroupV3WithZipStore(TestGroupWithZipStore, TestGroupV3): + + @staticmethod + def create_store(): + path = tempfile.mktemp(suffix='.zip') + atexit.register(os.remove, path) + store = ZipStoreV3(path) + return store, None + + class TestGroupWithDBMStore(TestGroup): @staticmethod @@ -1046,6 +1333,16 @@ def create_store(): return store, None +class TestGroupV3WithDBMStore(TestGroupWithDBMStore, TestGroupV3): + + @staticmethod + def create_store(): + path = tempfile.mktemp(suffix='.anydbm') + atexit.register(atexit_rmglob, path + '*') + store = DBMStoreV3(path, flag='n') + return store, None + + class TestGroupWithDBMStoreBerkeleyDB(TestGroup): @staticmethod @@ -1057,6 +1354,17 @@ def create_store(): return store, None +class TestGroupV3WithDBMStoreBerkeleyDB(TestGroupWithDBMStoreBerkeleyDB, TestGroupV3): + + @staticmethod + def create_store(): + bsddb3 = pytest.importorskip("bsddb3") + path = tempfile.mktemp(suffix='.dbm') + atexit.register(os.remove, path) + store = DBMStoreV3(path, flag='n', open=bsddb3.btopen) + return store, None + + class TestGroupWithLMDBStore(TestGroup): @staticmethod @@ -1068,6 +1376,17 @@ def create_store(): return store, None +class TestGroupV3WithLMDBStore(TestGroupWithLMDBStore, TestGroupV3): + + @staticmethod + def create_store(): + pytest.importorskip("lmdb") + path = tempfile.mktemp(suffix='.lmdb') + atexit.register(atexit_rmtree, path) + store = LMDBStoreV3(path) + return store, None + + class TestGroupWithSQLiteStore(TestGroup): def create_store(self): @@ -1078,6 +1397,16 @@ def create_store(self): return store, None +class TestGroupV3WithSQLiteStore(TestGroupWithSQLiteStore, TestGroupV3): + + def create_store(self): + pytest.importorskip("sqlite3") + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStoreV3(path) + return store, None + + class TestGroupWithChunkStore(TestGroup): @staticmethod @@ -1109,6 +1438,41 @@ def test_chunk_store(self): assert expect == actual +class TestGroupV3WithChunkStore(TestGroupWithChunkStore, TestGroupV3): + + @staticmethod + def create_store(): + return KVStoreV3(dict()), KVStoreV3(dict()) + + def test_chunk_store(self): + # setup + store, chunk_store = self.create_store() + path = 'group1' + g = self.create_group(store, path=path, chunk_store=chunk_store) + + # check attributes + assert store is g.store + assert chunk_store is g.chunk_store + + # create array + a = g.zeros('foo', shape=100, chunks=10) + assert store is a.store + assert chunk_store is a.chunk_store + a[:] = np.arange(100) + assert_array_equal(np.arange(100), a[:]) + + # check store keys + group_key = meta_root + path + '.group.json' + array_key = meta_root + path + '/foo' + '.array.json' + expect = sorted([group_key, array_key, 'zarr.json']) + actual = sorted(store.keys()) + assert expect == actual + expect = [data_root + path + '/foo/c' + str(i) for i in range(10)] + expect += ['zarr.json'] + actual = sorted(chunk_store.keys()) + assert expect == actual + + class TestGroupWithStoreCache(TestGroup): @staticmethod @@ -1117,44 +1481,78 @@ def create_store(): return store, None -def test_group(): +class TestGroupV3WithStoreCache(TestGroupWithStoreCache, TestGroupV3): + + @staticmethod + def create_store(): + store = LRUStoreCacheV3(dict(), 
max_size=None) + return store, None + + +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_group(zarr_version): # test the group() convenience function # basic usage - g = group() + if zarr_version == 2: + g = group() + assert '' == g.path + assert '/' == g.name + else: + g = group(path='group1', zarr_version=zarr_version) + with pytest.raises(ValueError): + # must supply path for v3 groups + group(zarr_version=3) + assert 'group1' == g.path + assert '/group1' == g.name assert isinstance(g, Group) - assert '' == g.path - assert '/' == g.name # usage with custom store - store = KVStore(dict()) - g = group(store=store) + if zarr_version == 2: + store = KVStore(dict()) + path = None + else: + store = KVStoreV3(dict()) + path = 'foo' + g = group(store=store, path=path) assert isinstance(g, Group) assert store is g.store # overwrite behaviour - store = KVStore(dict()) - init_array(store, shape=100, chunks=10) + if zarr_version == 2: + store = KVStore(dict()) + path = None + else: + store = KVStoreV3(dict()) + path = 'foo' + init_array(store, path=path, shape=100, chunks=10) with pytest.raises(ValueError): - group(store) - g = group(store, overwrite=True) + group(store, path=path) + g = group(store, path=path, overwrite=True) assert isinstance(g, Group) assert store is g.store -def test_open_group(): +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_open_group(zarr_version): # test the open_group() convenience function store = 'data/group.zarr' + expected_store_type = DirectoryStore if zarr_version == 2 else DirectoryStoreV3 + # mode == 'w' - g = open_group(store, mode='w') + path = None if zarr_version == 2 else 'group1' + g = open_group(store, path=path, mode='w', zarr_version=zarr_version) assert isinstance(g, Group) - assert isinstance(g.store, DirectoryStore) + assert isinstance(g.store, expected_store_type) assert 0 == len(g) g.create_groups('foo', 'bar') assert 2 == len(g) + # TODO: update the r, r+ test case here for zarr_version == 3 after + # open_array has StoreV3 support + # mode in 'r', 'r+' open_array('data/array.zarr', shape=100, chunks=10, mode='w') for mode in 'r', 'r+': @@ -1175,37 +1573,40 @@ def test_open_group(): # mode == 'a' shutil.rmtree(store) - g = open_group(store, mode='a') + g = open_group(store, path=path, mode='a', zarr_version=zarr_version) assert isinstance(g, Group) - assert isinstance(g.store, DirectoryStore) + assert isinstance(g.store, expected_store_type) assert 0 == len(g) g.create_groups('foo', 'bar') assert 2 == len(g) with pytest.raises(ValueError): - open_group('data/array.zarr', mode='a') + open_group('data/array.zarr', mode='a', zarr_version=zarr_version) # mode in 'w-', 'x' for mode in 'w-', 'x': shutil.rmtree(store) - g = open_group(store, mode=mode) + g = open_group(store, path=path, mode=mode, zarr_version=zarr_version) assert isinstance(g, Group) - assert isinstance(g.store, DirectoryStore) + assert isinstance(g.store, expected_store_type) assert 0 == len(g) g.create_groups('foo', 'bar') assert 2 == len(g) with pytest.raises(ValueError): - open_group(store, mode=mode) - with pytest.raises(ValueError): - open_group('data/array.zarr', mode=mode) + open_group(store, path=path, mode=mode, zarr_version=zarr_version) + if zarr_version == 2: + with pytest.raises(ValueError): + open_group('data/array.zarr', mode=mode) # open with path - g = open_group(store, path='foo/bar') + g = open_group(store, path='foo/bar', zarr_version=zarr_version) assert isinstance(g, Group) assert 'foo/bar' == g.path -def test_group_completions(): - g = 
group() +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_group_completions(zarr_version): + path = None if zarr_version == 2 else 'group1' + g = group(path=path, zarr_version=zarr_version) d = dir(g) assert 'foo' not in d assert 'bar' not in d @@ -1233,8 +1634,10 @@ def test_group_completions(): assert '456' not in d # not valid identifier -def test_group_key_completions(): - g = group() +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_group_key_completions(zarr_version): + path = None if zarr_version == 2 else 'group1' + g = group(path=path, zarr_version=zarr_version) d = dir(g) # noinspection PyProtectedMember k = g._ipython_key_completions_() @@ -1268,7 +1671,12 @@ def test_group_key_completions(): g.zeros('yyy', shape=100) g.zeros('zzz', shape=100) g.zeros('456', shape=100) - g.zeros('asdf;', shape=100) + if zarr_version == 2: + g.zeros('asdf;', shape=100) + else: + # cannot have ; in key name for v3 + with pytest.raises(ValueError): + g.zeros('asdf;', shape=100) d = dir(g) # noinspection PyProtectedMember @@ -1283,7 +1691,8 @@ def test_group_key_completions(): assert 'zzz' in d assert '123' not in d # not valid identifier assert '456' not in d # not valid identifier - assert 'asdf;' not in d # not valid identifier + if zarr_version == 2: + assert 'asdf;' not in d # not valid identifier assert 'foo' in k assert 'bar' in k @@ -1294,7 +1703,8 @@ def test_group_key_completions(): assert 'zzz' in k assert '123' in k assert '456' in k - assert 'asdf;' in k + if zarr_version == 2: + assert 'asdf;' in k def _check_tree(g, expect_bytes, expect_text): @@ -1308,9 +1718,11 @@ def _check_tree(g, expect_bytes, expect_text): isinstance(widget, ipytree.Tree) -def test_tree(): +@pytest.mark.parametrize('zarr_version', [2, 3]) +def test_tree(zarr_version): # setup - g1 = group() + path = None if zarr_version == 2 else 'group1' + g1 = group(path=path, zarr_version=zarr_version) g2 = g1.create_group('foo') g3 = g1.create_group('bar') g3.create_group('baz') @@ -1318,20 +1730,38 @@ def test_tree(): g5.create_dataset('baz', shape=100, chunks=10) # test root group - expect_bytes = textwrap.dedent("""\ - / - +-- bar - | +-- baz - | +-- quux - | +-- baz (100,) float64 - +-- foo""").encode() - expect_text = textwrap.dedent("""\ - / - ├── bar - │ ├── baz - │ └── quux - │ └── baz (100,) float64 - └── foo""") + if zarr_version == 2: + expect_bytes = textwrap.dedent("""\ + / + +-- bar + | +-- baz + | +-- quux + | +-- baz (100,) float64 + +-- foo""").encode() + expect_text = textwrap.dedent("""\ + / + ├── bar + │ ├── baz + │ └── quux + │ └── baz (100,) float64 + └── foo""") + else: + # Almost the same as for v2, but has a path name and the + # subgroups are not necessarily sorted alphabetically. 
+ expect_bytes = textwrap.dedent("""\ + group1 + +-- foo + +-- bar + +-- baz + +-- quux + +-- baz (100,) float64""").encode() + expect_text = textwrap.dedent("""\ + group1 + ├── foo + └── bar + ├── baz + └── quux + └── baz (100,) float64""") _check_tree(g1, expect_bytes, expect_text) # test different group @@ -1353,3 +1783,36 @@ def test_tree(): └── quux └── baz (100,) float64""") _check_tree(g3, expect_bytes, expect_text) + + +def test_group_mismatched_store_versions(): + store_v3 = KVStoreV3(dict()) + store_v2 = KVStore(dict()) + + # separate chunk store + chunk_store_v2 = KVStore(dict()) + chunk_store_v3 = KVStoreV3(dict()) + + init_group(store_v2, path='group1', chunk_store=chunk_store_v2) + init_group(store_v3, path='group1', chunk_store=chunk_store_v3) + + g1_v3 = Group(store_v3, path='group1', read_only=True, chunk_store=chunk_store_v3) + assert isinstance(g1_v3._store, KVStoreV3) + g1_v2 = Group(store_v2, path='group1', read_only=True, chunk_store=chunk_store_v2) + assert isinstance(g1_v2._store, KVStore) + + # store and chunk_store must have the same zarr protocol version + with pytest.raises(ValueError): + Group(store_v3, path='group1', read_only=False, chunk_store=chunk_store_v2) + with pytest.raises(ValueError): + Group(store_v2, path='group1', read_only=False, chunk_store=chunk_store_v3) + with pytest.raises(ValueError): + open_group(store_v2, path='group1', chunk_store=chunk_store_v3) + with pytest.raises(ValueError): + open_group(store_v3, path='group1', chunk_store=chunk_store_v2) + + # raises Value if read_only and path is not a pre-existing group + with pytest.raises(ValueError): + Group(store_v3, path='group2', read_only=True, chunk_store=chunk_store_v3) + with pytest.raises(ValueError): + Group(store_v3, path='group2', read_only=True, chunk_store=chunk_store_v3) diff --git a/zarr/tests/test_indexing.py b/zarr/tests/test_indexing.py index 74f0c9f7de..524d335c9f 100644 --- a/zarr/tests/test_indexing.py +++ b/zarr/tests/test_indexing.py @@ -1442,7 +1442,7 @@ def test_slice_selection_uints(): arr = np.arange(24).reshape((4, 6)) idx = np.uint64(3) slice_sel = make_slice_selection((idx,)) - assert arr[slice_sel].shape == (1, 6) + assert arr[tuple(slice_sel)].shape == (1, 6) def test_numpy_int_indexing(): diff --git a/zarr/tests/test_info.py b/zarr/tests/test_info.py index 361490c0a8..434d19d1f7 100644 --- a/zarr/tests/test_info.py +++ b/zarr/tests/test_info.py @@ -1,15 +1,18 @@ import numcodecs +import pytest import zarr +from zarr.util import InfoReporter -def test_info(): +@pytest.mark.parametrize('array_size', [10, 15000]) +def test_info(array_size): # setup g = zarr.group(store=dict(), chunk_store=dict(), synchronizer=zarr.ThreadSynchronizer()) g.create_group('foo') - z = g.zeros('bar', shape=10, filters=[numcodecs.Adler32()]) + z = g.zeros('bar', shape=array_size, filters=[numcodecs.Adler32()]) # test group info items = g.info_items() @@ -20,6 +23,10 @@ def test_info(): ]) assert expected_keys == keys + # can also get a string representation of info via the info attribute + assert isinstance(g.info, InfoReporter) + assert "Type" in repr(g.info) + # test array info items = z.info_items() keys = sorted([k for k, _ in items]) @@ -29,3 +36,7 @@ def test_info(): 'No. 
bytes stored', 'Storage ratio', 'Chunks initialized', 'Name' ]) assert expected_keys == keys + + # can also get a string representation of info via the info attribute + assert isinstance(z.info, InfoReporter) + assert "Type" in repr(z.info) diff --git a/zarr/tests/test_meta.py b/zarr/tests/test_meta.py index 5469921110..8acd634a13 100644 --- a/zarr/tests/test_meta.py +++ b/zarr/tests/test_meta.py @@ -1,4 +1,5 @@ import base64 +import copy import json import numpy as np @@ -8,7 +9,10 @@ from zarr.errors import MetadataError from zarr.meta import (ZARR_FORMAT, decode_array_metadata, decode_dtype, decode_group_metadata, encode_array_metadata, - encode_dtype, encode_fill_value, decode_fill_value) + encode_dtype, encode_fill_value, decode_fill_value, + get_extended_dtype_info, _v3_complex_types, + _v3_datetime_types, _default_entry_point_metadata_v3, + Metadata3) from zarr.util import normalize_dtype, normalize_fill_value @@ -260,6 +264,56 @@ def test_encode_decode_array_dtype_shape(): assert meta_dec['filters'] is None +def test_encode_decode_array_dtype_shape_v3(): + + meta = dict( + shape=(100,), + chunk_grid=dict(type='regular', + chunk_shape=(10,), + separator=('/')), + data_type=np.dtype('(10, 10)U4', '> 16) assert perm == '0o644' - info = z.getinfo('baz/') + info = z.getinfo(baz_key) perm = oct(info.external_attr >> 16) # only for posix platforms if os.name == 'posix': - assert perm == '0o40775' + if self.version == 2: + assert perm == '0o40775' + else: + # baz/ on v2, but baz on v3, so not a directory + assert perm == '0o644' z.close() def test_store_and_retrieve_ndarray(self): @@ -1609,8 +1783,8 @@ def create_store(self, dimension_separator=None): def test_context_manager(self): with self.create_store() as store: - store['foo'] = b'bar' - store['baz'] = b'qux' + store[self.root + 'foo'] = b'bar' + store[self.root + 'baz'] = b'qux' assert 2 == len(store) @@ -1669,8 +1843,8 @@ def create_store(self, **kwargs): def test_context_manager(self): with self.create_store() as store: - store['foo'] = b'bar' - store['baz'] = b'qux' + store[self.root + 'foo'] = b'bar' + store[self.root + 'baz'] = b'qux' assert 2 == len(store) @@ -1704,8 +1878,8 @@ def test_pickle(self): # setup store store = self.create_store() - store['foo'] = b'bar' - store['baz'] = b'quux' + store[self.root + 'foo'] = b'bar' + store[self.root + 'baz'] = b'quux' # round-trip through pickle with pytest.raises(PicklingError): @@ -1739,199 +1913,209 @@ def create_store(self, **kwargs): class TestLRUStoreCache(StoreTests): + CountingClass = CountingDict + LRUStoreClass = LRUStoreCache + def create_store(self, **kwargs): # wrapper therefore no dimension_separator argument skip_if_nested_chunks(**kwargs) - return LRUStoreCache(dict(), max_size=2**27) + return self.LRUStoreClass(dict(), max_size=2**27) def test_cache_values_no_max_size(self): # setup store - store = CountingDict() - store['foo'] = b'xxx' - store['bar'] = b'yyy' - assert 0 == store.counter['__getitem__', 'foo'] - assert 1 == store.counter['__setitem__', 'foo'] - assert 0 == store.counter['__getitem__', 'bar'] - assert 1 == store.counter['__setitem__', 'bar'] + store = self.CountingClass() + foo_key = self.root + 'foo' + bar_key = self.root + 'bar' + store[foo_key] = b'xxx' + store[bar_key] = b'yyy' + assert 0 == store.counter['__getitem__', foo_key] + assert 1 == store.counter['__setitem__', foo_key] + assert 0 == store.counter['__getitem__', bar_key] + assert 1 == store.counter['__setitem__', bar_key] # setup cache - cache = LRUStoreCache(store, max_size=None) + cache 
= self.LRUStoreClass(store, max_size=None) assert 0 == cache.hits assert 0 == cache.misses # test first __getitem__, cache miss - assert b'xxx' == cache['foo'] - assert 1 == store.counter['__getitem__', 'foo'] - assert 1 == store.counter['__setitem__', 'foo'] + assert b'xxx' == cache[foo_key] + assert 1 == store.counter['__getitem__', foo_key] + assert 1 == store.counter['__setitem__', foo_key] assert 0 == cache.hits assert 1 == cache.misses # test second __getitem__, cache hit - assert b'xxx' == cache['foo'] - assert 1 == store.counter['__getitem__', 'foo'] - assert 1 == store.counter['__setitem__', 'foo'] + assert b'xxx' == cache[foo_key] + assert 1 == store.counter['__getitem__', foo_key] + assert 1 == store.counter['__setitem__', foo_key] assert 1 == cache.hits assert 1 == cache.misses # test __setitem__, __getitem__ - cache['foo'] = b'zzz' - assert 1 == store.counter['__getitem__', 'foo'] - assert 2 == store.counter['__setitem__', 'foo'] + cache[foo_key] = b'zzz' + assert 1 == store.counter['__getitem__', foo_key] + assert 2 == store.counter['__setitem__', foo_key] # should be a cache hit - assert b'zzz' == cache['foo'] - assert 1 == store.counter['__getitem__', 'foo'] - assert 2 == store.counter['__setitem__', 'foo'] + assert b'zzz' == cache[foo_key] + assert 1 == store.counter['__getitem__', foo_key] + assert 2 == store.counter['__setitem__', foo_key] assert 2 == cache.hits assert 1 == cache.misses # manually invalidate all cached values cache.invalidate_values() - assert b'zzz' == cache['foo'] - assert 2 == store.counter['__getitem__', 'foo'] - assert 2 == store.counter['__setitem__', 'foo'] + assert b'zzz' == cache[foo_key] + assert 2 == store.counter['__getitem__', foo_key] + assert 2 == store.counter['__setitem__', foo_key] cache.invalidate() - assert b'zzz' == cache['foo'] - assert 3 == store.counter['__getitem__', 'foo'] - assert 2 == store.counter['__setitem__', 'foo'] + assert b'zzz' == cache[foo_key] + assert 3 == store.counter['__getitem__', foo_key] + assert 2 == store.counter['__setitem__', foo_key] # test __delitem__ - del cache['foo'] + del cache[foo_key] with pytest.raises(KeyError): # noinspection PyStatementEffect - cache['foo'] + cache[foo_key] with pytest.raises(KeyError): # noinspection PyStatementEffect - store['foo'] + store[foo_key] # verify other keys untouched - assert 0 == store.counter['__getitem__', 'bar'] - assert 1 == store.counter['__setitem__', 'bar'] + assert 0 == store.counter['__getitem__', bar_key] + assert 1 == store.counter['__setitem__', bar_key] def test_cache_values_with_max_size(self): # setup store - store = CountingDict() - store['foo'] = b'xxx' - store['bar'] = b'yyy' - assert 0 == store.counter['__getitem__', 'foo'] - assert 0 == store.counter['__getitem__', 'bar'] + store = self.CountingClass() + foo_key = self.root + 'foo' + bar_key = self.root + 'bar' + store[foo_key] = b'xxx' + store[bar_key] = b'yyy' + assert 0 == store.counter['__getitem__', foo_key] + assert 0 == store.counter['__getitem__', bar_key] # setup cache - can only hold one item - cache = LRUStoreCache(store, max_size=5) + cache = self.LRUStoreClass(store, max_size=5) assert 0 == cache.hits assert 0 == cache.misses # test first 'foo' __getitem__, cache miss - assert b'xxx' == cache['foo'] - assert 1 == store.counter['__getitem__', 'foo'] + assert b'xxx' == cache[foo_key] + assert 1 == store.counter['__getitem__', foo_key] assert 0 == cache.hits assert 1 == cache.misses # test second 'foo' __getitem__, cache hit - assert b'xxx' == cache['foo'] - assert 1 == 
store.counter['__getitem__', 'foo'] + assert b'xxx' == cache[foo_key] + assert 1 == store.counter['__getitem__', foo_key] assert 1 == cache.hits assert 1 == cache.misses # test first 'bar' __getitem__, cache miss - assert b'yyy' == cache['bar'] - assert 1 == store.counter['__getitem__', 'bar'] + assert b'yyy' == cache[bar_key] + assert 1 == store.counter['__getitem__', bar_key] assert 1 == cache.hits assert 2 == cache.misses # test second 'bar' __getitem__, cache hit - assert b'yyy' == cache['bar'] - assert 1 == store.counter['__getitem__', 'bar'] + assert b'yyy' == cache[bar_key] + assert 1 == store.counter['__getitem__', bar_key] assert 2 == cache.hits assert 2 == cache.misses # test 'foo' __getitem__, should have been evicted, cache miss - assert b'xxx' == cache['foo'] - assert 2 == store.counter['__getitem__', 'foo'] + assert b'xxx' == cache[foo_key] + assert 2 == store.counter['__getitem__', foo_key] assert 2 == cache.hits assert 3 == cache.misses # test 'bar' __getitem__, should have been evicted, cache miss - assert b'yyy' == cache['bar'] - assert 2 == store.counter['__getitem__', 'bar'] + assert b'yyy' == cache[bar_key] + assert 2 == store.counter['__getitem__', bar_key] assert 2 == cache.hits assert 4 == cache.misses # setup store - store = CountingDict() - store['foo'] = b'xxx' - store['bar'] = b'yyy' - assert 0 == store.counter['__getitem__', 'foo'] - assert 0 == store.counter['__getitem__', 'bar'] + store = self.CountingClass() + store[foo_key] = b'xxx' + store[bar_key] = b'yyy' + assert 0 == store.counter['__getitem__', foo_key] + assert 0 == store.counter['__getitem__', bar_key] # setup cache - can hold two items - cache = LRUStoreCache(store, max_size=6) + cache = self.LRUStoreClass(store, max_size=6) assert 0 == cache.hits assert 0 == cache.misses # test first 'foo' __getitem__, cache miss - assert b'xxx' == cache['foo'] - assert 1 == store.counter['__getitem__', 'foo'] + assert b'xxx' == cache[foo_key] + assert 1 == store.counter['__getitem__', foo_key] assert 0 == cache.hits assert 1 == cache.misses # test second 'foo' __getitem__, cache hit - assert b'xxx' == cache['foo'] - assert 1 == store.counter['__getitem__', 'foo'] + assert b'xxx' == cache[foo_key] + assert 1 == store.counter['__getitem__', foo_key] assert 1 == cache.hits assert 1 == cache.misses # test first 'bar' __getitem__, cache miss - assert b'yyy' == cache['bar'] - assert 1 == store.counter['__getitem__', 'bar'] + assert b'yyy' == cache[bar_key] + assert 1 == store.counter['__getitem__', bar_key] assert 1 == cache.hits assert 2 == cache.misses # test second 'bar' __getitem__, cache hit - assert b'yyy' == cache['bar'] - assert 1 == store.counter['__getitem__', 'bar'] + assert b'yyy' == cache[bar_key] + assert 1 == store.counter['__getitem__', bar_key] assert 2 == cache.hits assert 2 == cache.misses # test 'foo' __getitem__, should still be cached - assert b'xxx' == cache['foo'] - assert 1 == store.counter['__getitem__', 'foo'] + assert b'xxx' == cache[foo_key] + assert 1 == store.counter['__getitem__', foo_key] assert 3 == cache.hits assert 2 == cache.misses # test 'bar' __getitem__, should still be cached - assert b'yyy' == cache['bar'] - assert 1 == store.counter['__getitem__', 'bar'] + assert b'yyy' == cache[bar_key] + assert 1 == store.counter['__getitem__', bar_key] assert 4 == cache.hits assert 2 == cache.misses def test_cache_keys(self): # setup - store = CountingDict() - store['foo'] = b'xxx' - store['bar'] = b'yyy' - assert 0 == store.counter['__contains__', 'foo'] + store = self.CountingClass() + 
foo_key = self.root + 'foo' + bar_key = self.root + 'bar' + baz_key = self.root + 'baz' + store[foo_key] = b'xxx' + store[bar_key] = b'yyy' + assert 0 == store.counter['__contains__', foo_key] assert 0 == store.counter['__iter__'] assert 0 == store.counter['keys'] - cache = LRUStoreCache(store, max_size=None) + cache = self.LRUStoreClass(store, max_size=None) # keys should be cached on first call keys = sorted(cache.keys()) - assert keys == ['bar', 'foo'] + assert keys == [bar_key, foo_key] assert 1 == store.counter['keys'] # keys should now be cached assert keys == sorted(cache.keys()) assert 1 == store.counter['keys'] - assert 'foo' in cache - assert 0 == store.counter['__contains__', 'foo'] + assert foo_key in cache + assert 0 == store.counter['__contains__', foo_key] assert keys == sorted(cache) assert 0 == store.counter['__iter__'] assert 1 == store.counter['keys'] # cache should be cleared if store is modified - crude but simple for now - cache['baz'] = b'zzz' + cache[baz_key] = b'zzz' keys = sorted(cache.keys()) - assert keys == ['bar', 'baz', 'foo'] + assert keys == [bar_key, baz_key, foo_key] assert 2 == store.counter['keys'] # keys should now be cached assert keys == sorted(cache.keys()) @@ -1940,25 +2124,25 @@ def test_cache_keys(self): # manually invalidate keys cache.invalidate_keys() keys = sorted(cache.keys()) - assert keys == ['bar', 'baz', 'foo'] + assert keys == [bar_key, baz_key, foo_key] assert 3 == store.counter['keys'] - assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__contains__', foo_key] assert 0 == store.counter['__iter__'] cache.invalidate_keys() keys = sorted(cache) - assert keys == ['bar', 'baz', 'foo'] + assert keys == [bar_key, baz_key, foo_key] assert 4 == store.counter['keys'] - assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__contains__', foo_key] assert 0 == store.counter['__iter__'] cache.invalidate_keys() - assert 'foo' in cache + assert foo_key in cache assert 5 == store.counter['keys'] - assert 0 == store.counter['__contains__', 'foo'] + assert 0 == store.counter['__contains__', foo_key] assert 0 == store.counter['__iter__'] # check these would get counted if called directly - assert 'foo' in store - assert 1 == store.counter['__contains__', 'foo'] + assert foo_key in store + assert 1 == store.counter['__contains__', foo_key] assert keys == sorted(store) assert 1 == store.counter['__iter__'] @@ -2137,9 +2321,11 @@ def test_format_compatibility(): @skip_test_env_var("ZARR_TEST_ABS") class TestABSStore(StoreTests): + ABSStoreClass = ABSStore + def create_store(self, prefix=None, **kwargs): container_client = abs_container() - store = ABSStore( + store = self.ABSStoreClass( prefix=prefix, client=container_client, **kwargs, @@ -2149,7 +2335,9 @@ def create_store(self, prefix=None, **kwargs): def test_non_client_deprecated(self): with pytest.warns(FutureWarning, match='Providing'): - store = ABSStore("container", account_name="account_name", account_key="account_key") + store = self.ABSStoreClass( + "container", account_name="account_name", account_key="account_key" + ) for attr in ["container", "account_name", "account_key"]: with pytest.warns(FutureWarning, match=attr): @@ -2157,7 +2345,13 @@ def test_non_client_deprecated(self): assert result == attr def test_iterators_with_prefix(self): - for prefix in ['test_prefix', '/test_prefix', 'test_prefix/', 'test/prefix', '', None]: + prefixes = ['test_prefix', '/test_prefix', 'test_prefix/', 'test/prefix'] + + if self.version < 3: + # empty prefix 
not allowed in v3 + prefixes += ['', None] + + for prefix in prefixes: store = self.create_store(prefix=prefix) # test iterator methods on empty store @@ -2167,19 +2361,22 @@ def test_iterators_with_prefix(self): assert set() == set(store.values()) assert set() == set(store.items()) + prefix = meta_root if self.version > 2 else '' # setup some values - store['a'] = b'aaa' - store['b'] = b'bbb' - store['c/d'] = b'ddd' - store['c/e/f'] = b'fff' + store[prefix + 'a'] = b'aaa' + store[prefix + 'b'] = b'bbb' + store[prefix + 'c/d'] = b'ddd' + store[prefix + 'c/e/f'] = b'fff' # test iterators on store with data assert 4 == len(store) - assert {'a', 'b', 'c/d', 'c/e/f'} == set(store) - assert {'a', 'b', 'c/d', 'c/e/f'} == set(store.keys()) - assert {b'aaa', b'bbb', b'ddd', b'fff'} == set(store.values()) - assert ({('a', b'aaa'), ('b', b'bbb'), ('c/d', b'ddd'), ('c/e/f', b'fff')} == - set(store.items())) + keys = [prefix + 'a', prefix + 'b', prefix + 'c/d', prefix + 'c/e/f'] + values = [b'aaa', b'bbb', b'ddd', b'fff'] + items = [(k, v) for k, v in zip(keys, values)] + assert set(keys) == set(store) + assert set(keys) == set(store.keys()) + assert set(values) == set(store.values()) + assert set(items) == set(store.items()) def test_getsize(self): return super().test_getsize() @@ -2195,6 +2392,13 @@ def test_pickle(self): class TestConsolidatedMetadataStore: + version = 2 + ConsolidatedMetadataClass = ConsolidatedMetadataStore + + @property + def metadata_key(self): + return '.zmetadata' + def test_bad_format(self): # setup store with consolidated metadata @@ -2203,11 +2407,15 @@ def test_bad_format(self): # bad format version 'zarr_consolidated_format': 0, } - store['.zmetadata'] = json.dumps(consolidated).encode() + store[self.metadata_key] = json.dumps(consolidated).encode() # check appropriate error is raised with pytest.raises(MetadataError): - ConsolidatedMetadataStore(store) + self.ConsolidatedMetadataClass(store) + + def test_bad_store_version(self): + with pytest.raises(ValueError): + self.ConsolidatedMetadataClass(KVStoreV3(dict())) def test_read_write(self): @@ -2220,10 +2428,10 @@ def test_read_write(self): 'baz': 42, } } - store['.zmetadata'] = json.dumps(consolidated).encode() + store[self.metadata_key] = json.dumps(consolidated).encode() # create consolidated store - cs = ConsolidatedMetadataStore(store) + cs = self.ConsolidatedMetadataClass(store) # test __contains__, __getitem__ for key, value in consolidated['metadata'].items(): @@ -2252,3 +2460,24 @@ def test_fill_value_change(): assert a[0, 0] == 1 assert json.loads(a.store[".zarray"])["fill_value"] == 1 + + +def test_get_hierarchy_metadata_v2(): + # v2 stores do not have hierarchy metadata (i.e. 
zarr.json) + with pytest.raises(ValueError): + _get_hierarchy_metadata(KVStore(dict)) + + +def test_normalize_store_arg(tmpdir): + with pytest.raises(ValueError): + normalize_store_arg(dict(), zarr_version=4) + + for ext, Class in [('.zip', ZipStore), ('.n5', N5Store)]: + fn = tmpdir.join('store' + ext) + store = normalize_store_arg(str(fn), zarr_version=2, mode='w') + assert isinstance(store, Class) + + if have_fsspec: + path = tempfile.mkdtemp() + store = normalize_store_arg("file://" + path, zarr_version=2, mode='w') + assert isinstance(store, FSStore) diff --git a/zarr/tests/test_storage_v3.py b/zarr/tests/test_storage_v3.py new file mode 100644 index 0000000000..73fda1b758 --- /dev/null +++ b/zarr/tests/test_storage_v3.py @@ -0,0 +1,513 @@ +import array +import atexit +import copy +import os +import tempfile + +import numpy as np +import pytest +from zarr._storage.store import _get_hierarchy_metadata +from zarr.meta import _default_entry_point_metadata_v3 +from zarr.storage import (ABSStoreV3, ConsolidatedMetadataStoreV3, DBMStoreV3, + DirectoryStoreV3, FSStoreV3, KVStore, KVStoreV3, + LMDBStoreV3, LRUStoreCacheV3, MemoryStoreV3, + MongoDBStoreV3, RedisStoreV3, SQLiteStoreV3, StoreV3, + ZipStoreV3, atexit_rmglob, atexit_rmtree, data_root, + default_compressor, getsize, init_array, meta_root, + normalize_store_arg) +from zarr.tests.util import CountingDictV3, have_fsspec, skip_test_env_var + +# pytest will fail to run if the following fixtures aren't imported here +from .test_storage import StoreTests as _StoreTests +from .test_storage import TestABSStore as _TestABSStore +from .test_storage import TestConsolidatedMetadataStore as _TestConsolidatedMetadataStore +from .test_storage import TestDBMStore as _TestDBMStore +from .test_storage import TestDBMStoreBerkeleyDB as _TestDBMStoreBerkeleyDB +from .test_storage import TestDBMStoreDumb as _TestDBMStoreDumb +from .test_storage import TestDBMStoreGnu as _TestDBMStoreGnu +from .test_storage import TestDBMStoreNDBM as _TestDBMStoreNDBM +from .test_storage import TestDirectoryStore as _TestDirectoryStore +from .test_storage import TestFSStore as _TestFSStore +from .test_storage import TestLMDBStore as _TestLMDBStore +from .test_storage import TestLRUStoreCache as _TestLRUStoreCache +from .test_storage import TestMemoryStore as _TestMemoryStore +from .test_storage import TestSQLiteStore as _TestSQLiteStore +from .test_storage import TestSQLiteStoreInMemory as _TestSQLiteStoreInMemory +from .test_storage import TestZipStore as _TestZipStore +from .test_storage import (dimension_separator_fixture, s3, # noqa + skip_if_nested_chunks) + + +@pytest.fixture(params=[ + (None, "/"), + (".", "."), + ("/", "/"), +]) +def dimension_separator_fixture_v3(request): + return request.param + + +class DummyStore(): + # contains all methods expected of Mutable Mapping + + def keys(self): + """keys""" + + def values(self): + """values""" + + def get(self, value, default=None): + """get""" + + def __setitem__(self, key, value): + """__setitem__""" + + def __getitem__(self, key): + """__getitem__""" + + def __delitem__(self, key): + """__delitem__""" + + def __contains__(self, key): + """__contains__""" + + +class InvalidDummyStore(): + # does not contain expected methods of a MutableMapping + + def keys(self): + """keys""" + + +def test_ensure_store_v3(): + class InvalidStore: + pass + + with pytest.raises(ValueError): + StoreV3._ensure_store(InvalidStore()) + + # cannot initialize with a store from a different Zarr version + with pytest.raises(ValueError): + 
StoreV3._ensure_store(KVStore(dict())) + + assert StoreV3._ensure_store(None) is None + + # class with all methods of a MutableMapping will become a KVStoreV3 + assert isinstance(StoreV3._ensure_store(DummyStore), KVStoreV3) + + with pytest.raises(ValueError): + # does not have the methods expected of a MutableMapping + StoreV3._ensure_store(InvalidDummyStore) + + +def test_valid_key(): + store = KVStoreV3(dict) + + # only ascii keys are valid + assert not store._valid_key(5) + assert not store._valid_key(2.8) + + for key in store._valid_key_characters: + assert store._valid_key(key) + + # other characters not in store._valid_key_characters are not allowed + assert not store._valid_key('*') + assert not store._valid_key('~') + assert not store._valid_key('^') + + +def test_validate_key(): + store = KVStoreV3(dict) + + # zarr.json is a valid key + store._validate_key('zarr.json') + # but other keys not starting with meta/ or data/ are not + with pytest.raises(ValueError): + store._validate_key('zar.json') + + # valid ascii keys + for valid in [meta_root + 'arr1.array.json', + data_root + 'arr1.array.json', + meta_root + 'subfolder/item_1-0.group.json']: + store._validate_key(valid) + # but otherwise valid keys cannot end in / + with pytest.raises(ValueError): + assert store._validate_key(valid + '/') + + for invalid in [0, '*', '~', '^', '&']: + with pytest.raises(ValueError): + store._validate_key(invalid) + + +class StoreV3Tests(_StoreTests): + + version = 3 + root = meta_root + + def test_getsize(self): + # TODO: determine proper getsize() behavior for v3 + # Currently returns the combined size of entries under + # meta/root/path and data/root/path. + # Any path not under meta/root/ or data/root/ (including zarr.json) + # returns size 0. + + store = self.create_store() + if isinstance(store, dict) or hasattr(store, 'getsize'): + assert 0 == getsize(store, 'zarr.json') + store[meta_root + 'foo/a'] = b'x' + assert 1 == getsize(store) + assert 1 == getsize(store, 'foo') + store[meta_root + 'foo/b'] = b'x' + assert 2 == getsize(store, 'foo') + assert 1 == getsize(store, 'foo/b') + store[meta_root + 'bar/a'] = b'yy' + assert 2 == getsize(store, 'bar') + store[data_root + 'bar/a'] = b'zzz' + assert 5 == getsize(store, 'bar') + store[data_root + 'baz/a'] = b'zzz' + assert 3 == getsize(store, 'baz') + assert 10 == getsize(store) + store[data_root + 'quux'] = array.array('B', b'zzzz') + assert 14 == getsize(store) + assert 4 == getsize(store, 'quux') + store[data_root + 'spong'] = np.frombuffer(b'zzzzz', dtype='u1') + assert 19 == getsize(store) + assert 5 == getsize(store, 'spong') + store.close() + + def test_init_array(self, dimension_separator_fixture_v3): + + pass_dim_sep, want_dim_sep = dimension_separator_fixture_v3 + + store = self.create_store() + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100, + dimension_separator=pass_dim_sep) + + # check metadata + mkey = meta_root + path + '.array.json' + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + assert default_compressor == meta['compressor'] + assert meta['fill_value'] is None + # Missing MUST be assumed to be "/" + assert meta['chunk_grid']['separator'] is want_dim_sep + store.close() + + def test_list_prefix(self): + + store = self.create_store() + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100) + + expected = [meta_root + 
'arr1.array.json', 'zarr.json'] + assert sorted(store.list_prefix('')) == expected + + expected = [meta_root + 'arr1.array.json'] + assert sorted(store.list_prefix(meta_root.rstrip('/'))) == expected + + # cannot start prefix with '/' + with pytest.raises(ValueError): + store.list_prefix(prefix='/' + meta_root.rstrip('/')) + + def test_equal(self): + store = self.create_store() + assert store == store + + def test_rename_nonexisting(self): + store = self.create_store() + if store.is_erasable(): + with pytest.raises(ValueError): + store.rename('a', 'b') + else: + with pytest.raises(NotImplementedError): + store.rename('a', 'b') + + +class TestMappingStoreV3(StoreV3Tests): + + def create_store(self, **kwargs): + return KVStoreV3(dict()) + + def test_set_invalid_content(self): + # Generic mappings support non-buffer types + pass + + +class TestMemoryStoreV3(_TestMemoryStore, StoreV3Tests): + + def create_store(self, **kwargs): + skip_if_nested_chunks(**kwargs) + return MemoryStoreV3(**kwargs) + + +class TestDirectoryStoreV3(_TestDirectoryStore, StoreV3Tests): + + def create_store(self, normalize_keys=False, **kwargs): + # For v3, don't have to skip if nested. + # skip_if_nested_chunks(**kwargs) + + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + store = DirectoryStoreV3(path, normalize_keys=normalize_keys, **kwargs) + return store + + def test_rename_nonexisting(self): + store = self.create_store() + with pytest.raises(FileNotFoundError): + store.rename(meta_root + 'a', meta_root + 'b') + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreV3(_TestFSStore, StoreV3Tests): + + def create_store(self, normalize_keys=False, + dimension_separator=".", + path=None, + **kwargs): + + if path is None: + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + + store = FSStoreV3( + path, + normalize_keys=normalize_keys, + dimension_separator=dimension_separator, + **kwargs) + return store + + def test_init_array(self): + store = self.create_store() + path = 'arr1' + init_array(store, path=path, shape=1000, chunks=100) + + # check metadata + mkey = meta_root + path + '.array.json' + assert mkey in store + meta = store._metadata_class.decode_array_metadata(store[mkey]) + assert (1000,) == meta['shape'] + assert (100,) == meta['chunk_grid']['chunk_shape'] + assert np.dtype(None) == meta['data_type'] + assert meta['chunk_grid']['separator'] == "/" + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreV3WithKeySeparator(StoreV3Tests): + + def create_store(self, normalize_keys=False, key_separator=".", **kwargs): + + # Since the user is passing key_separator, that will take priority. 
+ skip_if_nested_chunks(**kwargs) + + path = tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + return FSStoreV3( + path, + normalize_keys=normalize_keys, + key_separator=key_separator) + + +# TODO: enable once N5StoreV3 has been implemented +# @pytest.mark.skipif(True, reason="N5StoreV3 not yet fully implemented") +# class TestN5StoreV3(_TestN5Store, TestDirectoryStoreV3, StoreV3Tests): + + +class TestZipStoreV3(_TestZipStore, StoreV3Tests): + + ZipStoreClass = ZipStoreV3 + + def create_store(self, **kwargs): + path = tempfile.mktemp(suffix='.zip') + atexit.register(os.remove, path) + store = ZipStoreV3(path, mode='w', **kwargs) + return store + + +class TestDBMStoreV3(_TestDBMStore, StoreV3Tests): + + def create_store(self, dimension_separator=None): + path = tempfile.mktemp(suffix='.anydbm') + atexit.register(atexit_rmglob, path + '*') + # create store using default dbm implementation + store = DBMStoreV3(path, flag='n', dimension_separator=dimension_separator) + return store + + +class TestDBMStoreV3Dumb(_TestDBMStoreDumb, StoreV3Tests): + + def create_store(self, **kwargs): + path = tempfile.mktemp(suffix='.dumbdbm') + atexit.register(atexit_rmglob, path + '*') + + import dbm.dumb as dumbdbm + store = DBMStoreV3(path, flag='n', open=dumbdbm.open, **kwargs) + return store + + +class TestDBMStoreV3Gnu(_TestDBMStoreGnu, StoreV3Tests): + + def create_store(self, **kwargs): + gdbm = pytest.importorskip("dbm.gnu") + path = tempfile.mktemp(suffix=".gdbm") # pragma: no cover + atexit.register(os.remove, path) # pragma: no cover + store = DBMStoreV3( + path, flag="n", open=gdbm.open, write_lock=False, **kwargs + ) # pragma: no cover + return store # pragma: no cover + + +class TestDBMStoreV3NDBM(_TestDBMStoreNDBM, StoreV3Tests): + + def create_store(self, **kwargs): + ndbm = pytest.importorskip("dbm.ndbm") + path = tempfile.mktemp(suffix=".ndbm") # pragma: no cover + atexit.register(atexit_rmglob, path + "*") # pragma: no cover + store = DBMStoreV3(path, flag="n", open=ndbm.open, **kwargs) # pragma: no cover + return store # pragma: no cover + + +class TestDBMStoreV3BerkeleyDB(_TestDBMStoreBerkeleyDB, StoreV3Tests): + + def create_store(self, **kwargs): + bsddb3 = pytest.importorskip("bsddb3") + path = tempfile.mktemp(suffix='.dbm') + atexit.register(os.remove, path) + store = DBMStoreV3(path, flag='n', open=bsddb3.btopen, write_lock=False, **kwargs) + return store + + +class TestLMDBStoreV3(_TestLMDBStore, StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("lmdb") + path = tempfile.mktemp(suffix='.lmdb') + atexit.register(atexit_rmtree, path) + buffers = True + store = LMDBStoreV3(path, buffers=buffers, **kwargs) + return store + + +class TestSQLiteStoreV3(_TestSQLiteStore, StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + path = tempfile.mktemp(suffix='.db') + atexit.register(atexit_rmtree, path) + store = SQLiteStoreV3(path, **kwargs) + return store + + +class TestSQLiteStoreV3InMemory(_TestSQLiteStoreInMemory, StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("sqlite3") + store = SQLiteStoreV3(':memory:', **kwargs) + return store + + +@skip_test_env_var("ZARR_TEST_MONGO") +class TestMongoDBStoreV3(StoreV3Tests): + + def create_store(self, **kwargs): + pytest.importorskip("pymongo") + store = MongoDBStoreV3(host='127.0.0.1', database='zarr_tests', + collection='zarr_tests', **kwargs) + # start with an empty store + store.clear() + return store + + +@skip_test_env_var("ZARR_TEST_REDIS") +class 
TestRedisStoreV3(StoreV3Tests): + + def create_store(self, **kwargs): + # TODO: this is the default host for Redis on Travis, + # we probably want to generalize this though + pytest.importorskip("redis") + store = RedisStoreV3(host='localhost', port=6379, **kwargs) + # start with an empty store + store.clear() + return store + + +class TestLRUStoreCacheV3(_TestLRUStoreCache, StoreV3Tests): + + CountingClass = CountingDictV3 + LRUStoreClass = LRUStoreCacheV3 + + +@skip_test_env_var("ZARR_TEST_ABS") +class TestABSStoreV3(_TestABSStore, StoreV3Tests): + + ABSStoreClass = ABSStoreV3 + + +def test_normalize_store_arg_v3(tmpdir): + + fn = tmpdir.join('store.zip') + store = normalize_store_arg(str(fn), zarr_version=3, mode='w') + assert isinstance(store, ZipStoreV3) + assert 'zarr.json' in store + + # can't pass storage_options to non-fsspec store + with pytest.raises(ValueError): + normalize_store_arg(str(fn), zarr_version=3, mode='w', storage_options={"some": "kwargs"}) + + if have_fsspec: + path = tempfile.mkdtemp() + store = normalize_store_arg("file://" + path, zarr_version=3, mode='w') + assert isinstance(store, FSStoreV3) + assert 'zarr.json' in store + + fn = tmpdir.join('store.n5') + with pytest.raises(NotImplementedError): + normalize_store_arg(str(fn), zarr_version=3, mode='w') + + # error on zarr_version=3 with a v2 store + with pytest.raises(ValueError): + normalize_store_arg(KVStore(dict()), zarr_version=3, mode='w') + + # error on zarr_version=2 with a v3 store + with pytest.raises(ValueError): + normalize_store_arg(KVStoreV3(dict()), zarr_version=2, mode='w') + + +class TestConsolidatedMetadataStoreV3(_TestConsolidatedMetadataStore): + + version = 3 + ConsolidatedMetadataClass = ConsolidatedMetadataStoreV3 + + @property + def metadata_key(self): + return meta_root + 'consolidated/.zmetadata' + + def test_bad_store_version(self): + with pytest.raises(ValueError): + self.ConsolidatedMetadataClass(KVStore(dict())) + + +def test_get_hierarchy_metadata(): + store = KVStoreV3({}) + + # error raised if 'zarr.json' is not in the store + with pytest.raises(ValueError): + _get_hierarchy_metadata(store) + + store['zarr.json'] = _default_entry_point_metadata_v3 + assert _get_hierarchy_metadata(store) == _default_entry_point_metadata_v3 + + # ValueError if only a subset of keys are present + store['zarr.json'] = {'zarr_format': 'https://purl.org/zarr/spec/protocol/core/3.0'} + with pytest.raises(ValueError): + _get_hierarchy_metadata(store) + + # ValueError if any unexpected keys are present + extra_metadata = copy.copy(_default_entry_point_metadata_v3) + extra_metadata['extra_key'] = 'value' + store['zarr.json'] = extra_metadata + with pytest.raises(ValueError): + _get_hierarchy_metadata(store) diff --git a/zarr/tests/test_sync.py b/zarr/tests/test_sync.py index 69fc0d7708..b2bd9e35bb 100644 --- a/zarr/tests/test_sync.py +++ b/zarr/tests/test_sync.py @@ -13,17 +13,18 @@ from zarr.core import Array from zarr.hierarchy import Group from zarr.storage import (DirectoryStore, KVStore, atexit_rmtree, init_array, - init_group) + init_group, meta_root) from zarr.sync import ProcessSynchronizer, ThreadSynchronizer -from zarr.tests.test_attrs import TestAttributes +# zarr_version fixture must be imported although not used directly here +from zarr.tests.test_attrs import TestAttributes, zarr_version # noqa from zarr.tests.test_core import TestArray from zarr.tests.test_hierarchy import TestGroup class TestAttributesWithThreadSynchronizer(TestAttributes): - def init_attributes(self, store,
read_only=False, cache=True): - key = 'attrs' + def init_attributes(self, store, read_only=False, cache=True, zarr_version=zarr_version): + key = '.zattrs' if zarr_version == 2 else meta_root + 'attrs' synchronizer = ThreadSynchronizer() return Attributes(store, synchronizer=synchronizer, key=key, read_only=read_only, cache=cache) @@ -31,8 +32,8 @@ def init_attributes(self, store, read_only=False, cache=True): class TestAttributesProcessSynchronizer(TestAttributes): - def init_attributes(self, store, read_only=False, cache=True): - key = 'attrs' + def init_attributes(self, store, read_only=False, cache=True, zarr_version=zarr_version): + key = '.zattrs' if zarr_version == 2 else meta_root + 'attrs' sync_path = mkdtemp() atexit.register(shutil.rmtree, sync_path) synchronizer = ProcessSynchronizer(sync_path) diff --git a/zarr/tests/util.py b/zarr/tests/util.py index e0f11d72ad..bb4df90d1b 100644 --- a/zarr/tests/util.py +++ b/zarr/tests/util.py @@ -1,7 +1,7 @@ import collections import os -from zarr.storage import Store +from zarr.storage import Store, StoreV3 import pytest @@ -41,6 +41,10 @@ def __delitem__(self, key): del self.wrapped[key] +class CountingDictV3(CountingDict, StoreV3): + pass + + def skip_test_env_var(name): """ Checks for environment variables indicating whether tests requiring services should be run """
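
Note on the v3 key layout exercised by the new tests (a minimal illustrative sketch, not part of the patch): the shared test classes parameterize the key prefix through class attributes (root = meta_root on StoreV3Tests, plus CountingClass, LRUStoreClass, ABSStoreClass and ConsolidatedMetadataClass), so the existing v2 assertions can be reused once keys are built as self.root + 'foo'. The sketch below uses only names that test_storage_v3.py itself imports (KVStoreV3, meta_root, data_root) and mirrors the behaviour checked by test_validate_key; it is a sketch of that convention, not a definitive statement of the store API.

    import pytest
    from zarr.storage import KVStoreV3, data_root, meta_root

    store = KVStoreV3(dict())

    # the entry-point document and keys under the meta/ or data/ prefixes
    # pass validation
    for key in ['zarr.json',
                meta_root + 'arr1.array.json',
                data_root + 'arr1/c0']:
        store._validate_key(key)  # no exception raised

    # bare v2-style keys such as '.zarray' are rejected, which is why the
    # shared tests prefix every key with self.root (meta_root for v3 suites)
    with pytest.raises(ValueError):
        store._validate_key('.zarray')

This is also why TestLRUStoreCacheV3 and TestABSStoreV3 above only need to swap in CountingDictV3, LRUStoreCacheV3 and ABSStoreV3 via class attributes rather than duplicating the v2 test bodies.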