
Commit a3d29f8

shoyer and Xarray-Beam authors authored and committed

Allow using ... as a key in chunk specifications.

This change enables specifying a default chunk size for all dimensions not explicitly listed in the `chunks` mapping by using `...` as a key. For example, `{'x': 10, ...: 20}` will chunk dimension 'x' into sizes of 10 and all other dimensions into sizes of 20.

PiperOrigin-RevId: 814430585

1 parent 69f05a7 commit a3d29f8
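
To illustrate the new behavior, a minimal sketch (the toy dimension names and sizes are assumptions; `normalize_chunks` is defined in `xarray_beam/_src/dataset.py`, per the diff below):

```python
# A sketch of the new `...` key, not part of the commit itself.
import numpy as np
import xarray
from xarray_beam._src import dataset as xbeam_dataset

template = xarray.Dataset(
    {'foo': (('x', 'y', 'z'), np.zeros((100, 100, 100)))}
)

# 'x' is chunked into sizes of 10; all other dimensions into sizes of 20.
chunks = xbeam_dataset.normalize_chunks({'x': 10, ...: 20}, template)
print(chunks)  # expected: {'x': 10, 'y': 20, 'z': 20}
```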

File tree: 5 files changed, +222 -83 lines changed


examples/xbeam_rechunk.py

Lines changed: 25 additions & 7 deletions

```diff
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Rechunk a Zarr dataset."""
+import types
+
 from absl import app
 from absl import flags
 import apache_beam as beam
@@ -51,22 +53,38 @@
 # pylint: disable=expression-not-assigned


-def _parse_chunks_str(chunks_str: str) -> dict[str, int]:
+def _try_to_int(chunks_str: str) -> int | str:
+  try:
+    return int(chunks_str)
+  except ValueError:
+    return chunks_str
+
+
+def _parse_chunks_flag(
+    chunks_str: str,
+) -> dict[str | types.EllipsisType, int | str] | int | str:
+  """Parse a string representation of unnormalized chunks."""
+  if '=' not in chunks_str:
+    return _try_to_int(chunks_str)
+
   chunks = {}
   parts = chunks_str.split(',')
   for part in parts:
     k, v = part.split('=')
-    chunks[k] = int(v)
+    if k == '...':
+      k = ...
+    chunks[k] = _try_to_int(v)
   return chunks


 def main(argv):
-  target_chunks = _parse_chunks_str(TARGET_CHUNKS.value)
+  target_chunks = _parse_chunks_flag(TARGET_CHUNKS.value)

-  if TARGET_SHARDS.value is not None:
-    target_shards = _parse_chunks_str(TARGET_SHARDS.value)
-  else:
-    target_shards = None
+  target_shards = (
+      _parse_chunks_flag(TARGET_SHARDS.value)
+      if TARGET_SHARDS.value is not None
+      else None
+  )

   with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
     root |= (
```
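
For illustration, the updated parser round-trips flag values like these (outputs inferred from the code above; not part of the commit):

```python
_parse_chunks_flag('10')                 # -> 10
_parse_chunks_flag('100MB')              # -> '100MB'
_parse_chunks_flag('x=10,y=20')          # -> {'x': 10, 'y': 20}
_parse_chunks_flag('time=-1,...=100MB')  # -> {'time': -1, ...: '100MB'}
```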

examples/xbeam_rechunk_test.py

Lines changed: 10 additions & 5 deletions

```diff
@@ -16,9 +16,9 @@
 from absl.testing import absltest
 from absl.testing import flagsaver
 import xarray
+from xarray_beam._src import test_util

 from . import xbeam_rechunk
-from xarray_beam._src import test_util


 class Era5RechunkTest(test_util.TestCase):
@@ -27,20 +27,25 @@ def test_chunks_only(self):
     input_path = self.create_tempdir('source').full_path
     output_path = self.create_tempdir('destination').full_path

-    input_ds = test_util.dummy_era5_surface_dataset(times=365)
+    input_ds = test_util.dummy_era5_surface_dataset(
+        latitudes=100, longitudes=200, times=365
+    )
     input_ds.chunk({'time': 31}).to_zarr(input_path)

     with flagsaver.flagsaver(
         input_path=input_path,
         output_path=output_path,
-        target_chunks='latitude=5,longitude=5,time=-1',
+        target_chunks=f'time=-1,...={365*10*20*4}B',
     ):
       xbeam_rechunk.main([])

     output_ds = xarray.open_zarr(output_path)
+    # dask.array tries to preserve the aspect ratio of the original array when
+    # splitting across dimensions, hence the 2x ratio between latitude and
+    # longitude.
     self.assertEqual(
         {k: v[0] for k, v in output_ds.chunks.items()},
-        {'latitude': 5, 'longitude': 5, 'time': 365}
+        {'latitude': 10, 'longitude': 20, 'time': 365},
     )
     xarray.testing.assert_identical(input_ds, output_ds)
@@ -63,7 +68,7 @@ def test_chunks_and_shards(self):
     output_ds = xarray.open_zarr(output_path)
     self.assertEqual(
         {k: v[0] for k, v in output_ds.chunks.items()},
-        {'latitude': 5, 'longitude': 5, 'time': 365}
+        {'latitude': 5, 'longitude': 5, 'time': 365},
     )
     actual_shards = {k: v.encoding['shards'] for k, v in output_ds.items()}
     expected_shards = {k: (365, 10, 10) for k, v in output_ds.items()}
```
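
A back-of-the-envelope check of the updated test target (assuming 4-byte float variables, consistent with the `* 4` factor in the flag value):

```python
target_bytes = 365 * 10 * 20 * 4  # 292,000 B, as passed via target_chunks
elements = target_bytes // 4      # 73,000 elements per chunk
spatial = elements // 365         # 200 latitude/longitude points per chunk
# dask mirrors the 100x200 (1:2) grid shape, so 200 points split as 10 x 20.
```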

xarray_beam/__init__.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -55,4 +55,4 @@
     DatasetToZarr as DatasetToZarr,
 )

-__version__ = '0.10.2'  # automatically synchronized to pyproject.toml
+__version__ = '0.10.3'  # automatically synchronized to pyproject.toml
```

xarray_beam/_src/dataset.py

Lines changed: 93 additions & 45 deletions

```diff
@@ -38,6 +38,7 @@
 import os.path
 import tempfile
 import textwrap
+import types
 from typing import Callable, Literal

 import apache_beam as beam
@@ -71,16 +72,19 @@ def _to_human_size(nbytes: int) -> str:
   return f'{_at_least_two_digits(nbytes)}EB'


+UnnormalizedChunks = Mapping[str | types.EllipsisType, int | str] | int | str
+
+
 def normalize_chunks(
-    chunks: Mapping[str, int | str] | str,
+    chunks: UnnormalizedChunks,
     template: xarray.Dataset,
     split_vars: bool = False,
     previous_chunks: Mapping[str, int] | None = None,
 ) -> dict[str, int]:
   """Normalize chunks for a xarray.Dataset.

-  This function interprets various chunk specifications (e.g., -1, 'auto',
-  byte-strings) and returns a dictionary mapping dimension names to
+  This function interprets various chunk specifications (e.g., integer sizes or
+  numbers of bytes) and returns a dictionary mapping dimension names to
   concrete integer chunk sizes. It uses ``dask.array.api.normalize_chunks``
   under the hood.
```
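
For reference, values matching the new `UnnormalizedChunks` alias (illustrative only; the ellipsis key is resolved later inside `normalize_chunks`):

```python
spec_a = 10                          # int: the same size for every dimension
spec_b = '100MB'                     # str: a byte target for all dimensions
spec_c = {'time': -1, ...: '100MB'}  # mapping with an ellipsis default
```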
```diff
@@ -89,19 +93,28 @@ def normalize_chunks(
     dimension.
   - An integer: the exact chunk size for this dimension.
   - A byte-string (e.g., "64MiB", "1GB"): indicates that dask should pick
-    chunk sizes to aim for chunks of approximately this size. If byte limits
-    are specified for multiple dimensions, they must be consistent (i.e.,
-    parse to the same number of bytes).
-  - ``'auto'``: chunks will be automatically determined for all 'auto'
-    dimensions to ensure chunks are approximately the target number of bytes
-    (defaulting to 128MiB, if no byte limits are specified).
+    chunk sizes to aim for chunks of approximately this size.
+
+  Only a single string value indicating a number of bytes can be specified. To
+  indicate that chunking applies to multiple dimensions, use a dict key of
+  ``...``.
+
+  Some examples:
+  - ``chunks={'time': 100}``: Each chunk will have exactly 100 elements along
+    the 'time' dimension.
+  - ``chunks="200MB"``: Create chunks that are approximately 200MB in size.
+  - ``chunks={'time': -1, ...: "100MB"}``: Chunks should include the full
+    'time' dimension, and be chunked along other dimensions such that
+    resulting chunks are approximately 100MB in size.

   Args:
     chunks: The desired chunking scheme. Can either be a dictionary mapping
-      dimension names to chunk sizes, or a single string chunk specification
-      (e.g., 'auto' or '100MiB') to be applied as the default for all
+      dimension names to chunk sizes, or a single string/integer chunk
+      specification (e.g., '100MB') to be applied as the default for all
       dimensions. Dimensions not included in the dictionary default to
-      previous_chunks (if available) or the full size of the dimension.
+      ``previous_chunks`` (if available) or the full size of the dimension. A
+      dict key of ellipsis (...) can also be used to indicate "all other
+      dimensions".
     template: An xarray.Dataset providing dimension sizes and dtype information,
       used for calculating chunk sizes in bytes.
     split_vars: If True, chunk size limits are applied per-variable, based on
```
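
A sketch of how the documented examples might resolve, assuming a hypothetical 365×100×200 float32 template (byte-based splits are chosen by dask, so exact sizes may vary):

```python
import numpy as np
import xarray
from xarray_beam._src.dataset import normalize_chunks

template = xarray.Dataset({
    't2m': (
        ('time', 'latitude', 'longitude'),
        np.zeros((365, 100, 200), dtype=np.float32),
    ),
})

normalize_chunks({'time': 100}, template)
# -> {'time': 100, 'latitude': 100, 'longitude': 200}
# (unlisted dimensions default to their full size)

normalize_chunks({'time': -1, ...: '10MB'}, template)
# -> the full 'time' dimension, with latitude/longitude sizes picked by dask
#    so that each chunk is approximately 10MB.
```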
```diff
@@ -113,15 +126,34 @@ def normalize_chunks(
   Returns:
     A dictionary mapping all dimension names to integer chunk sizes.
   """
-  if isinstance(chunks, str):
+  raw_chunks = chunks
+
+  if isinstance(chunks, str | int):
+    if chunks == 'auto':
+      raise ValueError(
+          'Unlike Dask, xarray_beam.normalize_chunks() does not support '
+          "chunks='auto'. Supply an explicit number of bytes instead, e.g., "
+          "chunks='100MB'."
+      )
     chunks = {k: chunks for k in template.dims}
+  elif isinstance(chunks, Mapping):
+    string_chunks = {v for v in chunks.values() if isinstance(v, str)}
+    if len(string_chunks) > 1:
+      raise ValueError(
+          f'cannot provide multiple distinct chunk sizes in bytes: {chunks}'
+      )
+    if any(v == 'auto' for v in chunks.values()):
+      raise ValueError(
+          'Unlike Dask, xarray_beam.normalize_chunks() does not support '
+          "'auto' chunk sizes. Supply an explicit number of bytes instead, "
+          f"e.g., '100MB'. Got {chunks=}"
+      )
+  else:
+    raise TypeError(f'chunks must be a string or a mapping, got {chunks=}')

-  string_chunks = {v for v in chunks.values() if isinstance(v, str)}
-  string_chunks.discard('auto')
-  if len(string_chunks) > 1:
-    raise ValueError(
-        f'cannot specify multiple distinct chunk sizes in bytes: {chunks}'
-    )
+  if ... in chunks:
+    default_chunks = chunks[...]
+    chunks = {k: chunks.get(k, default_chunks) for k in template.dims}

   defaults = previous_chunks if previous_chunks else template.sizes
   chunks: dict[str, int | str] = {**defaults, **chunks}  # pytype: disable=annotation-type-mismatch
```
142174
tuple(previous_chunks[k] for k in chunks) if previous_chunks else None
143175
)
144176

145-
# Note: This values are the same as the dask defaults. Set them explicitly
146-
# here to ensure that Xarray-Beam behavior does not depend on the user's
147-
# dask configuration.
148-
with dask.config.set({
149-
'array.chunk-size': '128MiB',
150-
'array.chunk-size-tolerance': 1.25,
151-
}):
152-
normalized_chunks_tuple = dask.array.api.normalize_chunks(
153-
chunks_tuple,
154-
shape,
155-
dtype=combined_dtype,
156-
previous_chunks=prev_chunks_tuple,
157-
)
177+
# Note: This is the same as the dask default. Set chunk-size-tolerance
178+
# explicitly here to ensure that Xarray-Beam behavior does not depend on the
179+
# user's dask configuration.
180+
with dask.config.set({'array.chunk-size-tolerance': 1.25}):
181+
try:
182+
normalized_chunks_tuple = dask.array.api.normalize_chunks(
183+
chunks_tuple,
184+
shape,
185+
dtype=combined_dtype,
186+
previous_chunks=prev_chunks_tuple,
187+
)
188+
except ValueError as e:
189+
raise ValueError(
190+
f'Invalid input for normalize_chunks: chunks={raw_chunks!r}, '
191+
f'{previous_chunks=}, {template=}'
192+
) from e
158193
return {k: v[0] for k, v in zip(chunks, normalized_chunks_tuple)}
159194

160195
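
For context, the wrapped dask function resolves byte-strings against shape and dtype; a sketch, assuming the public `dask.array.normalize_chunks` alias for `dask.array.api.normalize_chunks`:

```python
import dask.array

# -1 keeps the full axis; a byte-string asks dask to pick chunk sizes near
# that target. With float32 (4 bytes) and all 365 rows in every chunk,
# '1MiB' works out to roughly 2**20 / (365 * 4) ≈ 718 columns per chunk.
chunks = dask.array.normalize_chunks(
    (-1, '1MiB'), shape=(365, 100_000), dtype='float32'
)
```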

```diff
@@ -282,7 +317,9 @@ def __init__(
         this dataset's data.
     """
     self._template = template
-    self._chunks = chunks
+    self._chunks = {
+        k: min(template.sizes[k], v) for k, v in chunks.items()
+    }
     self._split_vars = split_vars
     self._ptransform = ptransform
```
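
The new clamping guarantees that stored chunk sizes never exceed the template's dimension sizes; in spirit:

```python
# Illustrative only: a requested chunk larger than the dimension is clamped.
template_sizes = {'time': 365}
requested = {'time': 1000}
clamped = {k: min(template_sizes[k], v) for k, v in requested.items()}
assert clamped == {'time': 365}
```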

```diff
@@ -357,7 +394,7 @@ def __repr__(self):
   def from_xarray(
       cls,
       source: xarray.Dataset,
-      chunks: Mapping[str, int | str] | str,
+      chunks: UnnormalizedChunks,
       *,
       split_vars: bool = False,
       previous_chunks: Mapping[str, int] | None = None,
@@ -384,7 +421,7 @@ def from_zarr(
       cls,
       path: str,
       *,
-      chunks: Mapping[str, int | str] | str | None = None,
+      chunks: UnnormalizedChunks | None = None,
       split_vars: bool = False,
   ) -> Dataset:
     """Create an xarray_beam.Dataset from a Zarr store.
@@ -426,8 +463,8 @@ def to_zarr(
       path: str,
       *,
       zarr_chunks_per_shard: Mapping[str, int] | None = None,
-      zarr_chunks: Mapping[str, int] | None = None,
-      zarr_shards: Mapping[str, int] | None = None,
+      zarr_chunks: UnnormalizedChunks | None = None,
+      zarr_shards: UnnormalizedChunks | None = None,
       zarr_format: int | None = None,
   ) -> beam.PTransform:
     """Write this dataset to a Zarr file.
```
```diff
@@ -461,14 +498,21 @@ def to_zarr(
     Returns:
       Beam PTransform that writes the dataset to a Zarr file.
     """
+    if zarr_shards is not None:
+      zarr_shards = normalize_chunks(
+          zarr_shards,
+          self.template,
+          split_vars=self.split_vars,
+          previous_chunks=self.chunks,
+      )
+
     if zarr_chunks_per_shard is not None:
       if zarr_chunks is not None:
         raise ValueError(
             'cannot supply both zarr_chunks_per_shard and zarr_chunks'
         )
       if zarr_shards is None:
-        zarr_shards = {}
-      zarr_shards = {**self.chunks, **zarr_shards}
+        zarr_shards = self.chunks
       zarr_chunks = {}
       for dim, existing_chunk_size in zarr_shards.items():
         multiple = zarr_chunks_per_shard.get(dim)
@@ -490,9 +534,13 @@ def to_zarr(
         raise ValueError('cannot supply zarr_shards without zarr_chunks')
       zarr_chunks = {}

-    zarr_chunks = {**self.chunks, **zarr_chunks}
+    zarr_chunks = normalize_chunks(
+        zarr_chunks,
+        self.template,
+        split_vars=self.split_vars,
+        previous_chunks=self.chunks,
+    )
     if zarr_shards is not None:
-      zarr_shards = {**self.chunks, **zarr_shards}
       self._check_shards_or_chunks(zarr_shards, 'shards')
     else:
       self._check_shards_or_chunks(zarr_chunks, 'chunks')
```
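
With `zarr_chunks` and `zarr_shards` now accepting unnormalized specs, a call might look like this (a sketch; the paths are hypothetical, `Dataset` is reached via its `_src` module per the file paths in this commit, and the returned `beam.PTransform` still needs to be applied in a pipeline):

```python
from xarray_beam._src import dataset as xbeam_dataset

ds = xbeam_dataset.Dataset.from_zarr('/tmp/input.zarr')
# Shards span the full time axis and target ~100MB along other dimensions;
# zarr_chunks_per_shard then subdivides each shard 5x along time.
write = ds.to_zarr(
    '/tmp/output.zarr',
    zarr_shards={'time': -1, ...: '100MB'},
    zarr_chunks_per_shard={'time': 5},
)
```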
```diff
@@ -537,9 +585,9 @@ def map_blocks(
         attempt will be made to infer the template by applying ``func`` to the
         existing template, which requires that ``func`` is implemented using
         dask compatible operations.
-      chunks: new chunks sizes for the resulting dataset. If not provided, an
-        attempt will be made to infer the new chunks based on the existing
-        chunks, dimensions sizes and the new template.
+      chunks: explicit new chunk sizes created by applying ``func``. If not
+        provided, an attempt will be made to infer the new chunks based on the
+        existing chunks, dimension sizes and the new template.

     Returns:
       New Dataset with updated chunks.
@@ -587,7 +635,7 @@ def map_blocks(

   def rechunk(
       self,
-      chunks: dict[str, int | str] | str,
+      chunks: UnnormalizedChunks,
       min_mem: int | None = None,
       max_mem: int = 2**30,
   ) -> Dataset:
```
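
`rechunk` accepts the same spec; a brief sketch (the byte target is arbitrary):

```python
# Full 'time' axis, ~256MB chunks along the remaining dimensions, keeping
# the default 1 GiB ceiling on memory per rechunking stage.
rechunked = ds.rechunk({'time': -1, ...: '256MB'}, max_mem=2**30)
```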
