Commit

Merge branch 'main' into docs/3.0-async-guide
dstansby committed Jan 3, 2025
2 parents 77da71f + d6384f5 commit cc31f09
Showing 16 changed files with 300 additions and 194 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -99,6 +99,8 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # required for hatch version discovery, which is needed for numcodecs.zarr3
- name: Set up Python
uses: actions/setup-python@v5
with:
1 change: 1 addition & 0 deletions .gitignore
@@ -54,6 +54,7 @@ docs/_build/
docs/_autoapi
docs/data
data
data.zip

# PyBuilder
target/
200 changes: 98 additions & 102 deletions docs/user-guide/arrays.rst

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions docs/user-guide/attributes.rst
@@ -7,10 +7,10 @@ Zarr arrays and groups support custom key/value attributes, which can be useful
storing application-specific metadata. For example::

>>> import zarr
>>> # TODO: replace with create_group after #2463
>>> root = zarr.group()
>>> store = zarr.storage.MemoryStore()
>>> root = zarr.create_group(store=store)
>>> root.attrs['foo'] = 'bar'
>>> z = root.zeros(name='zzz', shape=(10000, 10000))
>>> z = root.create_array(name='zzz', shape=(10000, 10000), dtype='int32')
>>> z.attrs['baz'] = 42
>>> z.attrs['qux'] = [1, 4, 7, 12]
>>> sorted(root.attrs)
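The requirement behind these examples is that attribute values are persisted as JSON in the array/group metadata document, so anything you assign must survive a JSON round-trip. A minimal pure-Python check of that constraint (no zarr required):

```python
import json

# Zarr stores attributes as JSON in the metadata document, so every
# value must round-trip through JSON unchanged.
attrs = {'foo': 'bar', 'baz': 42, 'qux': [1, 4, 7, 12]}
restored = json.loads(json.dumps(attrs))
assert restored == attrs

# A non-serializable value (e.g. a set) would fail at write time:
try:
    json.dumps({'bad': {1, 2, 3}})
except TypeError:
    print('sets are not JSON-serializable')
```

Values such as NumPy scalars or sets therefore need converting to plain JSON types before being stored as attributes.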
8 changes: 4 additions & 4 deletions docs/user-guide/config.rst
@@ -10,11 +10,11 @@ Configuration values can be set using code like the following::

>>> import zarr
>>>
>>> zarr.config.set({"array.order": "F"})
>>> zarr.config.set({'array.order': 'F'})
<donfig.config_obj.ConfigSet object at ...>
>>>
>>> # revert this change so it doesn't impact the rest of the docs
>>> zarr.config.set({"array.order": "C"})
>>> zarr.config.set({'array.order': 'C'})
<donfig.config_obj.ConfigSet object at ...>

Alternatively, configuration values can be set using environment variables, e.g.
@@ -35,8 +35,8 @@ Configuration options include the following:

For selecting custom implementations of codecs, pipelines, buffers and ndbuffers,
first register the implementations in the registry and then select them in the config.
For example, an implementation of the bytes codec in a class "custompackage.NewBytesCodec",
requires the value of ``codecs.bytes.name`` to be "custompackage.NewBytesCodec".
For example, an implementation of the bytes codec in a class ``'custompackage.NewBytesCodec'``,
requires the value of ``codecs.bytes.name`` to be ``'custompackage.NewBytesCodec'``.

This is the current default configuration::

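Under the hood ``zarr.config`` is a donfig configuration object, and a dotted key such as ``'array.order'`` addresses a nested mapping. A toy sketch of that dot-path assignment — ``set_option`` is a hypothetical helper for illustration, not part of zarr's or donfig's API:

```python
def set_option(config, dotted_key, value):
    # Walk (creating as needed) one nested dict per component before
    # the leaf, then assign the leaf value.
    *parents, leaf = dotted_key.split('.')
    node = config
    for key in parents:
        node = node.setdefault(key, {})
    node[leaf] = value
    return config

cfg = set_option({}, 'array.order', 'F')
print(cfg)  # {'array': {'order': 'F'}}
```

This is also why the equivalent environment variable spells the same path with separators (``ZARR_ARRAY__ORDER``): each path component selects one level of the nested configuration mapping.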
63 changes: 52 additions & 11 deletions docs/user-guide/consolidated_metadata.rst
@@ -1,3 +1,7 @@
.. only:: doctest

>>> from pprint import pprint

.. _user-guide-consolidated-metadata:

Consolidated metadata
@@ -29,13 +33,12 @@ attribute of the ``GroupMetadata`` object.
>>> import zarr
>>>
>>> store = zarr.storage.MemoryStore()
>>> # TODO: replace with create_group after #2463
>>> group = zarr.open_group(store=store)
>>> group.create_array(shape=(1,), name="a")
>>> group = zarr.create_group(store=store)
>>> group.create_array(shape=(1,), name='a', dtype='float64')
<Array memory://.../a shape=(1,) dtype=float64>
>>> group.create_array(shape=(2, 2), name="b")
>>> group.create_array(shape=(2, 2), name='b', dtype='float64')
<Array memory://.../b shape=(2, 2) dtype=float64>
>>> group.create_array(shape=(3, 3, 3), name="c")
>>> group.create_array(shape=(3, 3, 3), name='c', dtype='float64')
<Array memory://.../c shape=(3, 3, 3) dtype=float64>
>>> zarr.consolidate_metadata(store)
<Group memory://...>
@@ -45,21 +48,59 @@ that can be used.:

>>> consolidated = zarr.open_group(store=store)
>>> consolidated_metadata = consolidated.metadata.consolidated_metadata.metadata
>>> dict(sorted(consolidated_metadata.items()))
{}
>>> pprint(dict(sorted(consolidated_metadata.items())))
{'a': ArrayV3Metadata(shape=(1,),
data_type=<DataType.float64: 'float64'>,
chunk_grid=RegularChunkGrid(chunk_shape=(1,)),
chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
separator='/'),
fill_value=np.float64(0.0),
codecs=[BytesCodec(endian=<Endian.little: 'little'>),
ZstdCodec(level=0, checksum=False)],
attributes={},
dimension_names=None,
zarr_format=3,
node_type='array',
storage_transformers=()),
'b': ArrayV3Metadata(shape=(2, 2),
data_type=<DataType.float64: 'float64'>,
chunk_grid=RegularChunkGrid(chunk_shape=(2, 2)),
chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
separator='/'),
fill_value=np.float64(0.0),
codecs=[BytesCodec(endian=<Endian.little: 'little'>),
ZstdCodec(level=0, checksum=False)],
attributes={},
dimension_names=None,
zarr_format=3,
node_type='array',
storage_transformers=()),
'c': ArrayV3Metadata(shape=(3, 3, 3),
data_type=<DataType.float64: 'float64'>,
chunk_grid=RegularChunkGrid(chunk_shape=(3, 3, 3)),
chunk_key_encoding=DefaultChunkKeyEncoding(name='default',
separator='/'),
fill_value=np.float64(0.0),
codecs=[BytesCodec(endian=<Endian.little: 'little'>),
ZstdCodec(level=0, checksum=False)],
attributes={},
dimension_names=None,
zarr_format=3,
node_type='array',
storage_transformers=())}

Operations on the group to get children automatically use the consolidated metadata.:

>>> consolidated["a"] # no read / HTTP request to the Store is required
>>> consolidated['a'] # no read / HTTP request to the Store is required
<Array memory://.../a shape=(1,) dtype=float64>

With nested groups, the consolidated metadata is available on the children, recursively.:

>>> child = group.create_group("child", attributes={"kind": "child"})
>>> grandchild = child.create_group("child", attributes={"kind": "grandchild"})
>>> child = group.create_group('child', attributes={'kind': 'child'})
>>> grandchild = child.create_group('child', attributes={'kind': 'grandchild'})
>>> consolidated = zarr.consolidate_metadata(store)
>>>
>>> consolidated["child"].metadata.consolidated_metadata
>>> consolidated['child'].metadata.consolidated_metadata
ConsolidatedMetadata(metadata={'child': GroupMetadata(attributes={'kind': 'grandchild'}, zarr_format=3, consolidated_metadata=ConsolidatedMetadata(metadata={}, kind='inline', must_understand=False), node_type='group')}, kind='inline', must_understand=False)

Synchronization and Concurrency
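The mechanism can be sketched in plain Python: consolidation folds each child node's metadata document into a single document held at the root, so one read recovers the whole hierarchy. ``consolidate`` below is a toy stand-in for ``zarr.consolidate_metadata``, not zarr code:

```python
def consolidate(node_metadata):
    # Fold every child's metadata document into the root document,
    # keyed by the child's path, under 'consolidated_metadata'.
    root = dict(node_metadata.get('', {}))
    root['consolidated_metadata'] = {
        path: meta for path, meta in node_metadata.items() if path
    }
    return root

docs = {
    '': {'node_type': 'group'},
    'a': {'node_type': 'array', 'shape': [1]},
    'b': {'node_type': 'array', 'shape': [2, 2]},
}
root = consolidate(docs)
print(sorted(root['consolidated_metadata']))  # ['a', 'b']
```

After consolidation, listing or opening any child requires only the one root document — which is why it pays off most on high-latency stores such as object storage.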
22 changes: 13 additions & 9 deletions docs/user-guide/groups.rst
@@ -1,3 +1,8 @@
.. only:: doctest

>>> import shutil
>>> shutil.rmtree('data', ignore_errors=True)

.. _user-guide-groups:

Working with groups
@@ -10,9 +15,8 @@ support a similar interface.
To create a group, use the :func:`zarr.group` function::

>>> import zarr
>>>
>>> # TODO: replace with create_group after #2463
>>> root = zarr.group()
>>> store = zarr.storage.MemoryStore()
>>> root = zarr.create_group(store=store)
>>> root
<Group memory://...>

@@ -24,7 +28,7 @@ Groups have a similar API to the Group class from `h5py

Groups can also contain arrays, e.g.::

>>> z1 = bar.zeros(name='baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4')
>>> z1 = bar.create_array(name='baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32')
>>> z1
<Array memory://.../foo/bar/baz shape=(10000, 10000) dtype=int32>

@@ -59,7 +63,7 @@ sub-directories, e.g.::
>>> root
<Group file://data/group.zarr>
>>>
>>> z = root.zeros(name='foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4')
>>> z = root.create_array(name='foo/bar/baz', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32')
>>> z
<Array file://data/group.zarr/foo/bar/baz shape=(10000, 10000) dtype=int32>

@@ -77,12 +81,12 @@ Array and group diagnostics
Diagnostic information about arrays and groups is available via the ``info``
property. E.g.::

>>> # TODO: replace with create_group after #2463
>>> root = zarr.group()
>>> store = zarr.storage.MemoryStore()
>>> root = zarr.group(store=store)
>>> foo = root.create_group('foo')
>>> bar = foo.zeros(name='bar', shape=1000000, chunks=100000, dtype='i8')
>>> bar = foo.create_array(name='bar', shape=1000000, chunks=100000, dtype='int64')
>>> bar[:] = 42
>>> baz = foo.zeros(name='baz', shape=(1000, 1000), chunks=(100, 100), dtype='f4')
>>> baz = foo.create_array(name='baz', shape=(1000, 1000), chunks=(100, 100), dtype='float32')
>>> baz[:] = 4.2
>>> root.info
Name :
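Member access like ``root['foo/bar/baz']`` boils down to resolving a ``/``-separated path one level at a time through the hierarchy. A toy sketch with nested dicts standing in for groups (not zarr's implementation):

```python
def resolve(root, path):
    # Split 'foo/bar/baz' into components and descend one mapping
    # per component, the way a group hierarchy is addressed.
    node = root
    for part in path.strip('/').split('/'):
        node = node[part]
    return node

tree = {'foo': {'bar': {'baz': '<array baz>'}}}
print(resolve(tree, 'foo/bar/baz'))  # <array baz>
```

A missing component raises ``KeyError``, which mirrors the error you get when asking a group for a member that does not exist.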
2 changes: 1 addition & 1 deletion docs/user-guide/index.rst
@@ -25,4 +25,4 @@ Advanced Topics
performance
async
consolidated_metadata
extending
extending
60 changes: 32 additions & 28 deletions docs/user-guide/performance.rst
@@ -1,4 +1,9 @@
user-guide-performance
.. only:: doctest

>>> import shutil
>>> shutil.rmtree('data', ignore_errors=True)

.. _user-guide-performance:

Optimizing performance
======================
@@ -19,42 +24,41 @@ better performance, at least when using the Blosc compression library.
The optimal chunk shape will depend on how you want to access the data. E.g.,
for a 2-dimensional array, if you only ever take slices along the first
dimension, then chunk across the second dimension. If you know you want to chunk
across an entire dimension you can use ``None`` or ``-1`` within the ``chunks``
argument, e.g.::
across an entire dimension you can use the full size of that dimension within the
``chunks`` argument, e.g.::

>>> import zarr
>>>
>>> z1 = zarr.zeros((10000, 10000), chunks=(100, None), dtype='i4')
>>> z1 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(100, 10000), dtype='int32')
>>> z1.chunks
(100, 10000)

Alternatively, if you only ever take slices along the second dimension, then
chunk across the first dimension, e.g.::

>>> z2 = zarr.zeros((10000, 10000), chunks=(None, 100), dtype='i4')
>>> z2 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(10000, 100), dtype='int32')
>>> z2.chunks
(10000, 100)

If you require reasonable performance for both access patterns then you need to
find a compromise, e.g.::

>>> z3 = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4')
>>> z3 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(1000, 1000), dtype='int32')
>>> z3.chunks
(1000, 1000)

If you are feeling lazy, you can let Zarr guess a chunk shape for your data by
providing ``chunks=True``, although please note that the algorithm for guessing
providing ``chunks='auto'``, although please note that the algorithm for guessing
a chunk shape is based on simple heuristics and may be far from optimal. E.g.::

>>> z4 = zarr.zeros((10000, 10000), chunks=True, dtype='i4')
>>> z4 = zarr.create_array(store={}, shape=(10000, 10000), chunks='auto', dtype='int32')
>>> z4.chunks
(625, 625)

If you know you are always going to be loading the entire array into memory, you
can turn off chunks by providing ``chunks=False``, in which case there will be
one single chunk for the array::
can turn off chunks by providing ``chunks`` equal to ``shape``, in which case there
will be one single chunk for the array::

>>> z5 = zarr.zeros((10000, 10000), chunks=False, dtype='i4')
>>> z5 = zarr.create_array(store={}, shape=(10000, 10000), chunks=(10000, 10000), dtype='int32')
>>> z5.chunks
(10000, 10000)
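The cost asymmetry described above is simple arithmetic: a read touches every chunk its selection intersects. A pure-Python sketch (no zarr required) counting the chunks hit when slicing the first ``n_rows`` rows of a 2-D array:

```python
import math

def chunks_for_row_slice(shape, chunks, n_rows):
    # Chunks intersected along axis 0, times all chunks along axis 1
    # (a row slice spans the full width of the array).
    return math.ceil(n_rows / chunks[0]) * math.ceil(shape[1] / chunks[1])

shape = (10000, 10000)
print(chunks_for_row_slice(shape, (100, 10000), 100))   # 1:   row-banded chunks
print(chunks_for_row_slice(shape, (10000, 100), 100))   # 100: column-banded chunks
print(chunks_for_row_slice(shape, (1000, 1000), 100))   # 10:  the compromise layout
```

The same slice thus costs anywhere from one chunk read to one hundred, purely as a function of chunk shape — which is the whole argument for matching chunks to the dominant access pattern.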

@@ -70,9 +74,9 @@ ratios, depending on the correlation structure within the data. E.g.::

>>> import numpy as np
>>>
>>> a = np.arange(100000000, dtype='i4').reshape(10000, 10000).T
>>> # TODO: replace with create_array after #2463
>>> c = zarr.array(a, chunks=(1000, 1000))
>>> a = np.arange(100000000, dtype='int32').reshape(10000, 10000).T
>>> c = zarr.create_array(store={}, shape=a.shape, chunks=(1000, 1000), dtype=a.dtype, config={'order': 'C'})
>>> c[:] = a
>>> c.info_complete()
Type : Array
Zarr format : 3
@@ -88,7 +92,8 @@ ratios, depending on the correlation structure within the data. E.g.::
Storage ratio : 1.2
Chunks Initialized : 100
>>> with zarr.config.set({'array.order': 'F'}):
... f = zarr.array(a, chunks=(1000, 1000))
... f = zarr.create_array(store={}, shape=a.shape, chunks=(1000, 1000), dtype=a.dtype)
... f[:] = a
>>> f.info_complete()
Type : Array
Zarr format : 3
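The C/F difference amounts to which index is contiguous in memory, which in turn decides how compressible each chunk's byte stream is. A sketch of the two flat-offset formulas for a 2-D array:

```python
def c_offset(i, j, shape):
    # C (row-major): stepping j by 1 moves 1 element in memory.
    return i * shape[1] + j

def f_offset(i, j, shape):
    # Fortran (column-major): stepping i by 1 moves 1 element in memory.
    return i + j * shape[0]

shape = (10000, 10000)
print(c_offset(0, 1, shape) - c_offset(0, 0, shape))  # 1: columns adjacent in C order
print(f_offset(1, 0, shape) - f_offset(0, 0, shape))  # 1: rows adjacent in F order
print(c_offset(1, 0, shape) - c_offset(0, 0, shape))  # 10000: row stride in C order
```

For the transposed array above, values correlated along the first axis sit 10000 elements apart in C order but adjacent in F order — hence the different compression ratios.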
@@ -143,15 +148,14 @@ the time required to write an array with different values.::
... shape = (chunks[0] * 1024,)
... data = np.random.randint(0, 255, shape)
... dtype = 'uint8'
... with zarr.config.set({"array.write_empty_chunks": write_empty_chunks}):
... arr = zarr.open(
... f"data/example-{write_empty_chunks}.zarr",
... shape=shape,
... chunks=chunks,
... dtype=dtype,
... fill_value=0,
... mode='w'
... )
... arr = zarr.create_array(
... f'data/example-{write_empty_chunks}.zarr',
... shape=shape,
... chunks=chunks,
... dtype=dtype,
... fill_value=0,
... config={'write_empty_chunks': write_empty_chunks}
... )
... # initialize all chunks
... arr[:] = 100
... result = []
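The trade-off above hinges on a cheap test: a chunk whose elements all equal the fill value carries no information and can be skipped or deleted, since it is fully reconstructible from metadata alone. A toy version of the check — zarr's real implementation operates on array buffers, not Python lists:

```python
def chunk_is_empty(chunk, fill_value):
    # True when every element equals the fill value, i.e. the chunk
    # can be dropped and later rebuilt from the fill value.
    return all(v == fill_value for v in chunk)

print(chunk_is_empty([0, 0, 0, 0], 0))  # True  -> store may skip the write
print(chunk_is_empty([0, 7, 0, 0], 0))  # False -> chunk must be written
```

With ``write_empty_chunks`` disabled, every chunk write pays this comparison; when many chunks are empty the saved I/O dwarfs that cost, and when none are, the comparison is pure overhead.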
@@ -208,9 +212,9 @@ to re-open any underlying files or databases upon being unpickled.
E.g., pickle/unpickle a local store array::

>>> import pickle
>>>
>>> # TODO: replace with create_array after #2463
>>> z1 = zarr.array(store="data/example-2", data=np.arange(100000))
>>> data = np.arange(100000)
>>> z1 = zarr.create_array(store='data/example-2.zarr', shape=data.shape, chunks=data.shape, dtype=data.dtype)
>>> z1[:] = data
>>> s = pickle.dumps(z1)
>>> z2 = pickle.loads(s)
>>> z1 == z2
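The re-opening behaviour described above can be sketched with ``__getstate__``/``__setstate__``: only the path travels through the pickle, and a live handle is rebuilt on unpickling. ``ToyLocalStore`` is illustrative, not zarr's ``LocalStore``:

```python
import pickle

class ToyLocalStore:
    def __init__(self, path):
        self.path = path
        self._handle = object()  # stand-in for an open file/db handle

    def __getstate__(self):
        # Persist only the path; the live handle is not picklable state.
        return {'path': self.path}

    def __setstate__(self, state):
        # Re-open on unpickle, as zarr stores re-open files/databases.
        self.__init__(state['path'])

s2 = pickle.loads(pickle.dumps(ToyLocalStore('data/example-2.zarr')))
print(s2.path)  # data/example-2.zarr
```

This is also why pickling an in-memory store cannot work the same way: there is no path from which the unpickled copy could recover the data.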