diff --git a/docs/release.rst b/docs/release.rst index 7a5cf51db7..cd9ebbd784 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -9,6 +9,27 @@ Unreleased Bug fixes ~~~~~~~~~ +* Fix bug that made it impossible to create an ``FSStore`` on unlistable filesystems + (e.g. some HTTP servers). + By :user:`Ryan Abernathey <rabernat>`; :issue:`993`. + +Enhancements +~~~~~~~~~~~~ + +* **Create FSStore from an existing fsspec filesystem**. If you have created + an fsspec filesystem outside of Zarr, you can now pass it as a keyword + argument to ``FSStore``. + By :user:`Ryan Abernathey <rabernat>`. + + +.. _release_2.11.3: + +2.11.3 +------ + +Bug fixes +~~~~~~~~~ + + * Changes the default value of ``write_empty_chunks`` to ``True`` to prevent unanticipated data losses when the data types do not have a proper default value when empty chunks are read back in. @@ -322,7 +343,7 @@ Bug fixes * FSStore: default to normalize_keys=False By :user:`Josh Moore `; :issue:`755`. -* ABSStore: compatibility with ``azure.storage.python>=12`` +* ABSStore: compatibility with ``azure.storage.python>=12`` By :user:`Tom Augspurger `; :issue:`618` @@ -487,7 +508,7 @@ This release will be the last to support Python 3.5, next version of Zarr will b * `DirectoryStore` now uses `os.scandir`, which should make listing large store faster, :issue:`563` - + * Remove a few remaining Python 2-isms. By :user:`Poruri Sai Rahul `; :issue:`393`. @@ -507,7 +528,7 @@ This release will be the last to support Python 3.5, next version of Zarr will b ``zarr.errors`` have been replaced by ``ValueError`` subclasses. The corresponding ``err_*`` function have been removed. :issue:`590`, :issue:`614`) -* Improve consistency of terminology regarding arrays and datasets in the +* Improve consistency of terminology regarding arrays and datasets in the documentation. By :user:`Josh Moore `; :issue:`571`. 
diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 53ddddb0b9..b40896c78c 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -758,7 +758,7 @@ databases. The :class:`zarr.storage.RedisStore` class interfaces `Redis `_ (an object oriented NoSQL database). These stores respectively require the `redis-py `_ and -`pymongo `_ packages to be installed. +`pymongo `_ packages to be installed. For compatibility with the `N5 `_ data format, Zarr also provides an N5 backend (this is currently an experimental feature). Similar to the zip storage class, an @@ -897,6 +897,18 @@ The second invocation here will be much faster. Note that the ``storage_options` have become more complex here, to account for the two parts of the supplied URL. +It is also possible to initialize the filesystem outside of Zarr and then pass +it through. This requires creating an :class:`zarr.storage.FSStore` object +explicitly. For example:: + + >>> import s3fs # doctest: +SKIP + >>> fs = s3fs.S3FileSystem(anon=True) # doctest: +SKIP + >>> store = zarr.storage.FSStore('/zarr-demo/store', fs=fs) # doctest: +SKIP + >>> g = zarr.open_group(store) # doctest: +SKIP + +This is useful in cases where you want to also use the same fsspec filesystem object +separately from Zarr. + .. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ .. _supported by fsspec: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations @@ -1306,18 +1318,18 @@ filters (e.g., byte-shuffle) have been applied. Empty chunks ~~~~~~~~~~~~ - + As of version 2.11, it is possible to configure how Zarr handles the storage of chunks that are "empty" (i.e., every element in the chunk is equal to the array's fill value). -When creating an array with ``write_empty_chunks=False``, +When creating an array with ``write_empty_chunks=False``, Zarr will check whether a chunk is empty before compression and storage. 
If a chunk is empty, -then Zarr does not store it, and instead deletes the chunk from storage -if the chunk had been previously stored. +then Zarr does not store it, and instead deletes the chunk from storage +if the chunk had been previously stored. -This optimization prevents storing redundant objects and can speed up reads, but the cost is -added computation during array writes, since the contents of -each chunk must be compared to the fill value, and these advantages are contingent on the content of the array. -If you know that your data will form chunks that are almost always non-empty, then there is no advantage to the optimization described above. +This optimization prevents storing redundant objects and can speed up reads, but the cost is +added computation during array writes, since the contents of +each chunk must be compared to the fill value, and these advantages are contingent on the content of the array. +If you know that your data will form chunks that are almost always non-empty, then there is no advantage to the optimization described above. In this case, creating an array with ``write_empty_chunks=True`` (the default) will instruct Zarr to write every chunk without checking for emptiness. The following example illustrates the effect of the ``write_empty_chunks`` flag on @@ -1329,7 +1341,7 @@ the time required to write an array with different values.:: >>> from tempfile import TemporaryDirectory >>> def timed_write(write_empty_chunks): ... """ - ... Measure the time required and number of objects created when writing + ... Measure the time required and number of objects created when writing ... to a Zarr array with random ints or fill value. ... """ ... 
chunks = (8192,) @@ -1368,8 +1380,8 @@ the time required to write an array with different values.:: Random Data: 0.1359s, 1024 objects stored Empty Data: 0.0301s, 0 objects stored -In this example, writing random data is slightly slower with ``write_empty_chunks=True``, -but writing empty data is substantially faster and generates far fewer objects in storage. +In this example, writing random data is slightly slower with ``write_empty_chunks=True``, +but writing empty data is substantially faster and generates far fewer objects in storage. .. _tutorial_rechunking: diff --git a/zarr/storage.py b/zarr/storage.py index e8a65147aa..48b6f049dd 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1262,8 +1262,9 @@ class FSStore(Store): Parameters ---------- url : str - The destination to map. Should include protocol and path, - like "s3://bucket/root" + The destination to map. If no fs is provided, should include protocol + and path, like "s3://bucket/root". If an fs is provided, can be a path + within that filesystem, like "bucket/root" normalize_keys : bool key_separator : str public API for accessing dimension_separator. Never `None` @@ -1275,7 +1276,19 @@ class FSStore(Store): as a missing key dimension_separator : {'.', '/'}, optional Separator placed between the dimensions of a chunk. - storage_options : passed to the fsspec implementation + fs : fsspec.spec.AbstractFileSystem, optional + An existing filesystem to use for the store. + check : bool, optional + If True, performs a touch at the root location, to check for write access. + Passed to `fsspec.mapping.FSMap` constructor. + create : bool, optional + If True, performs a mkdir at the root location. + Passed to `fsspec.mapping.FSMap` constructor. + missing_exceptions : sequence of Exceptions, optional + Exception classes to associate with missing files. + Passed to `fsspec.mapping.FSMap` constructor. + storage_options : passed to the fsspec implementation. Cannot be used + together with fs. 
""" _array_meta_key = array_meta_key _group_meta_key = group_meta_key @@ -1285,18 +1298,37 @@ def __init__(self, url, normalize_keys=False, key_separator=None, mode='w', exceptions=(KeyError, PermissionError, IOError), dimension_separator=None, + fs=None, + check=False, + create=False, + missing_exceptions=None, **storage_options): import fsspec - self.normalize_keys = normalize_keys - protocol, _ = fsspec.core.split_protocol(url) - # set auto_mkdir to True for local file system - if protocol in (None, "file") and not storage_options.get("auto_mkdir"): - storage_options["auto_mkdir"] = True + mapper_options = {"check": check, "create": create} + # https://github.com/zarr-developers/zarr-python/pull/911#discussion_r841926292 + # Some fsspec implementations don't accept missing_exceptions. + # This is a workaround to avoid passing it in the most common scenarios. + # Remove this and add missing_exceptions to mapper_options when fsspec is released. + if missing_exceptions is not None: + mapper_options["missing_exceptions"] = missing_exceptions # pragma: no cover + + if fs is None: + protocol, _ = fsspec.core.split_protocol(url) + # set auto_mkdir to True for local file system + if protocol in (None, "file") and not storage_options.get("auto_mkdir"): + storage_options["auto_mkdir"] = True + self.map = fsspec.get_mapper(url, **{**mapper_options, **storage_options}) + self.fs = self.map.fs # for direct operations + self.path = self.fs._strip_protocol(url) + else: + if storage_options: + raise ValueError("Cannot specify both fs and storage_options") + self.fs = fs + self.path = self.fs._strip_protocol(url) + self.map = self.fs.get_mapper(self.path, **mapper_options) - self.map = fsspec.get_mapper(url, **storage_options) - self.fs = self.map.fs # for direct operations - self.path = self.fs._strip_protocol(url) + self.normalize_keys = normalize_keys self.mode = mode self.exceptions = exceptions # For backwards compatibility. 
Guaranteed to be non-None @@ -1308,8 +1340,6 @@ def __init__(self, url, normalize_keys=False, key_separator=None, # Pass attributes to array creation self._dimension_separator = dimension_separator - if self.fs.exists(self.path) and not self.fs.isdir(self.path): - raise FSPathExistNotDir(url) def _default_key_separator(self): if self.key_separator is None: diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 097512a240..ce8f03d0da 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -26,6 +26,7 @@ from zarr.hierarchy import Group, group from zarr.storage import ( ConsolidatedMetadataStore, + FSStore, KVStore, MemoryStore, atexit_rmtree, @@ -205,9 +206,18 @@ def test_tree(zarr_version): @pytest.mark.parametrize('zarr_version', [2, 3]) -@pytest.mark.parametrize('with_chunk_store', [False, True], ids=['default', 'with_chunk_store']) @pytest.mark.parametrize('stores_from_path', [False, True]) -def test_consolidate_metadata(with_chunk_store, zarr_version, stores_from_path): +@pytest.mark.parametrize( + 'with_chunk_store,listable', + [(False, True), (True, True), (False, False)], + ids=['default-listable', 'with_chunk_store-listable', 'default-unlistable'] +) +def test_consolidate_metadata(with_chunk_store, + zarr_version, + listable, + monkeypatch, + stores_from_path): + # setup initial data if stores_from_path: store = tempfile.mkdtemp() @@ -228,6 +238,10 @@ def test_consolidate_metadata(with_chunk_store, zarr_version, stores_from_path): version_kwarg = {} path = 'dataset' if zarr_version == 3 else None z = group(store, chunk_store=chunk_store, path=path, **version_kwarg) + + # Reload the actual store implementation in case a str path was used + store_to_copy = z.store + z.create_group('g1') g2 = z.create_group('g2') g2.attrs['hello'] = 'world' @@ -278,14 +292,36 @@ def test_consolidate_metadata(with_chunk_store, zarr_version, stores_from_path): for key in meta_keys: del store[key] + # 
https://github.com/zarr-developers/zarr-python/issues/993 + # Make sure we can still open consolidated on an unlistable store: + if not listable: + fs_memory = pytest.importorskip("fsspec.implementations.memory") + monkeypatch.setattr(fs_memory.MemoryFileSystem, "isdir", lambda x, y: False) + monkeypatch.delattr(fs_memory.MemoryFileSystem, "ls") + fs = fs_memory.MemoryFileSystem() + if zarr_version == 2: + store_to_open = FSStore("", fs=fs) + else: + store_to_open = FSStoreV3("", fs=fs) + + # copy original store to new unlistable store + store_to_open.update(store_to_copy) + + else: + store_to_open = store + # open consolidated - z2 = open_consolidated(store, chunk_store=chunk_store, path=path, **version_kwarg) + z2 = open_consolidated(store_to_open, chunk_store=chunk_store, path=path, **version_kwarg) assert ['g1', 'g2'] == list(z2) assert 'world' == z2.g2.attrs['hello'] assert 1 == z2.g2.arr.attrs['data'] assert (z2.g2.arr[:] == 1.0).all() assert 16 == z2.g2.arr.nchunks - assert 16 == z2.g2.arr.nchunks_initialized + if listable: + assert 16 == z2.g2.arr.nchunks_initialized + else: + with pytest.raises(NotImplementedError): + _ = z2.g2.arr.nchunks_initialized if stores_from_path: # path string is note a BaseStore subclass so cannot be used to diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py index ebb24a07ed..2212b035c2 100644 --- a/zarr/tests/test_core.py +++ b/zarr/tests/test_core.py @@ -2495,6 +2495,13 @@ def test_store_has_bytes_values(self): pass +fsspec_mapper_kwargs = { + "check": True, + "create": True, + "missing_exceptions": None +} + + @pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") class TestArrayWithFSStore(TestArray): @staticmethod @@ -2502,7 +2509,35 @@ def create_array(read_only=False, **kwargs): path = mkdtemp() atexit.register(shutil.rmtree, path) key_separator = kwargs.pop('key_separator', ".") - store = FSStore(path, key_separator=key_separator, auto_mkdir=True) + store = FSStore(path, key_separator=key_separator, 
auto_mkdir=True, **fsspec_mapper_kwargs) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) + kwargs.setdefault('compressor', Blosc()) + init_array(store, **kwargs) + return Array(store, read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) + + def expected(self): + return [ + "ab753fc81df0878589535ca9bad2816ba88d91bc", + "c16261446f9436b1e9f962e57ce3e8f6074abe8a", + "c2ef3b2fb2bc9dcace99cd6dad1a7b66cc1ea058", + "6e52f95ac15b164a8e96843a230fcee0e610729b", + "091fa99bc60706095c9ce30b56ce2503e0223f56", + ] + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestArrayWithFSStoreFromFilesystem(TestArray): + @staticmethod + def create_array(read_only=False, **kwargs): + from fsspec.implementations.local import LocalFileSystem + fs = LocalFileSystem(auto_mkdir=True) + path = mkdtemp() + atexit.register(shutil.rmtree, path) + key_separator = kwargs.pop('key_separator', ".") + store = FSStore(path, fs=fs, key_separator=key_separator, **fsspec_mapper_kwargs) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) write_empty_chunks = kwargs.pop('write_empty_chunks', True) @@ -3148,7 +3183,40 @@ def create_array(array_path='arr1', read_only=False, **kwargs): path = mkdtemp() atexit.register(shutil.rmtree, path) key_separator = kwargs.pop('key_separator', ".") - store = FSStoreV3(path, key_separator=key_separator, auto_mkdir=True) + store = FSStoreV3( + path, + key_separator=key_separator, + auto_mkdir=True, + **fsspec_mapper_kwargs + ) + cache_metadata = kwargs.pop('cache_metadata', True) + cache_attrs = kwargs.pop('cache_attrs', True) + write_empty_chunks = kwargs.pop('write_empty_chunks', True) + kwargs.setdefault('compressor', Blosc()) + init_array(store, path=array_path, **kwargs) + return Array(store, path=array_path, 
read_only=read_only, cache_metadata=cache_metadata, + cache_attrs=cache_attrs, write_empty_chunks=write_empty_chunks) + + def expected(self): + return [ + "1509abec4285494b61cd3e8d21f44adc3cf8ddf6", + "7cfb82ec88f7ecb7ab20ae3cb169736bc76332b8", + "b663857bb89a8ab648390454954a9cdd453aa24b", + "21e90fa927d09cbaf0e3b773130e2dc05d18ff9b", + "e8c1fdd18b5c2ee050b59d0c8c95d07db642459c", + ] + + +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestArrayWithFSStoreV3FromFilesystem(TestArrayWithPathV3, TestArrayWithFSStore): + @staticmethod + def create_array(array_path='arr1', read_only=False, **kwargs): + from fsspec.implementations.local import LocalFileSystem + fs = LocalFileSystem(auto_mkdir=True) + path = mkdtemp() + atexit.register(shutil.rmtree, path) + key_separator = kwargs.pop('key_separator', ".") + store = FSStoreV3(path, fs=fs, key_separator=key_separator, **fsspec_mapper_kwargs) cache_metadata = kwargs.pop('cache_metadata', True) cache_attrs = kwargs.pop('cache_attrs', True) write_empty_chunks = kwargs.pop('write_empty_chunks', True) diff --git a/zarr/tests/test_storage.py b/zarr/tests/test_storage.py index 9fb869d5e3..abddb6965c 100644 --- a/zarr/tests/test_storage.py +++ b/zarr/tests/test_storage.py @@ -1249,10 +1249,6 @@ def test_read_only(self): assert store[self.root + 'foo'] == b"bar" - filepath = os.path.join(path, self.root + "foo") - with pytest.raises(ValueError): - self.create_store(path=filepath, mode='r') - def test_eq(self): store1 = self.create_store(path="anypath") store2 = self.create_store(path="anypath") @@ -1339,6 +1335,35 @@ def create_store(self, normalize_keys=False, key_separator=".", **kwargs): key_separator=key_separator) +@pytest.mark.skipif(have_fsspec is False, reason="needs fsspec") +class TestFSStoreFromFilesystem(StoreTests): + + def create_store(self, normalize_keys=False, + dimension_separator=".", + path=None, + **kwargs): + import fsspec + fs = fsspec.filesystem("file") + + if path is None: + path = 
tempfile.mkdtemp() + atexit.register(atexit_rmtree, path) + + with pytest.raises(ValueError): + # can't specify storage_options when passing an + # existing fs object + _ = FSStore(path, fs=fs, auto_mkdir=True) + + store = FSStore( + path, + normalize_keys=normalize_keys, + dimension_separator=dimension_separator, + fs=fs, + **kwargs) + + return store + + @pytest.fixture() def s3(request): # writable local S3 system