From 583b912454f26f62134a3b99803a2ff80941a9ef Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Wed, 15 Jan 2025 18:05:51 +0000 Subject: [PATCH] Added crude dimension-based load chunking control. --- lib/ncdata/netcdf4.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/lib/ncdata/netcdf4.py b/lib/ncdata/netcdf4.py index d70effa..8c77ff2 100644 --- a/lib/ncdata/netcdf4.py +++ b/lib/ncdata/netcdf4.py @@ -208,9 +208,7 @@ def to_nc4( nc4ds.close() -def _from_nc4_group( - nc4ds: Union[nc.Dataset, nc.Group], -) -> NcData: +def _from_nc4_group(nc4ds: Union[nc.Dataset, nc.Group], dim_chunks) -> NcData: """ Inner routine for :func:`from_nc4`. @@ -261,8 +259,9 @@ def _from_nc4_group( variable_name=varname, group_names_path=group_names_path, ) + chunks = [dim_chunks.get(name, "auto") for name in var.dimensions] var.data = da.from_array( - proxy, chunks="auto", asarray=True, meta=np.ndarray + proxy, chunks=chunks, asarray=True, meta=np.ndarray ) for attrname in nc4var.ncattrs(): @@ -277,13 +276,16 @@ def _from_nc4_group( # And finally, groups -- by the magic of recursion ... for group_name, group in nc4ds.groups.items(): - ncdata.groups[group_name] = _from_nc4_group(nc4ds.groups[group_name]) + ncdata.groups[group_name] = _from_nc4_group( + nc4ds.groups[group_name], dim_chunks=dim_chunks + ) return ncdata def from_nc4( - nc4_dataset_or_file: Union[nc.Dataset, nc.Group, Path, str] + nc4_dataset_or_file: Union[nc.Dataset, nc.Group, Path, str], + dim_chunks: Dict[str, Union[int, str]] = None, ) -> NcData: """ Load NcData from a :class:`netCDF4.Dataset` or netCDF file. @@ -294,10 +296,31 @@ def from_nc4( source of load data. Can be either a :class:`netCDF4.Dataset`, a :class:`netCDF4.Group`, a :class:`pathlib.Path` or a string. + dim_chunks + a dictionary of chunk sizes (number, or -1 or "auto") for specific + dimensions, specified by dimension name. + Defaults to "auto" for all unspecified dimensions. + Returns ------- ncdata : NcData + + Examples + -------- + For example, to avoid cases where a simple dask ``from_array(chunks="auto")`` + will fail + + >>> from ncdata.netcdf4 import from_nc4 + >>> from tests import testdata_dir + >>> path = testdata_dir / "toa_brightness_temperature.nc" + >>> ds = from_nc4(path, dim_chunks={"x": 15}) + >>> ds.variables["data"].data.chunksize + (160, 15) + >>> + """ + if dim_chunks is None: + dim_chunks = {} caller_owns_dataset = hasattr(nc4_dataset_or_file, "variables") if caller_owns_dataset: nc4ds = nc4_dataset_or_file @@ -305,7 +328,7 @@ def from_nc4( nc4ds = nc.Dataset(nc4_dataset_or_file) try: - ncdata = _from_nc4_group(nc4ds) + ncdata = _from_nc4_group(nc4ds, dim_chunks) finally: if not caller_owns_dataset: nc4ds.close()