diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index 62bf08b366f..074ebf611ca 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -20,6 +20,7 @@ conda uninstall -y --force \ bottleneck \ sparse \ flox \ + xarray-datatree \ h5netcdf \ xarray # to limit the runtime of Upstream CI @@ -47,5 +48,6 @@ python -m pip install \ git+https://github.com/intake/filesystem_spec \ git+https://github.com/SciTools/nc-time-axis \ git+https://github.com/xarray-contrib/flox \ + git+https://github.com/xarray-contrib/xarray-datatree \ git+https://github.com/h5netcdf/h5netcdf python -m pip install pytest-timeout diff --git a/ci/requirements/all-but-dask.yml b/ci/requirements/all-but-dask.yml index ce819640c76..373aa4e0f73 100644 --- a/ci/requirements/all-but-dask.yml +++ b/ci/requirements/all-but-dask.yml @@ -41,4 +41,5 @@ dependencies: - sparse - toolz - typing_extensions + - xarray-datatree - zarr diff --git a/ci/requirements/environment-py311.yml b/ci/requirements/environment-py311.yml index e23fa44c683..8f5852e03d3 100644 --- a/ci/requirements/environment-py311.yml +++ b/ci/requirements/environment-py311.yml @@ -45,4 +45,5 @@ dependencies: # - sparse - toolz - typing_extensions + - xarray-datatree - zarr diff --git a/ci/requirements/environment-windows-py311.yml b/ci/requirements/environment-windows-py311.yml index 3fc207dc609..3c77fa38b1d 100644 --- a/ci/requirements/environment-windows-py311.yml +++ b/ci/requirements/environment-windows-py311.yml @@ -41,4 +41,5 @@ dependencies: # - sparse - toolz - typing_extensions + - xarray-datatree - zarr diff --git a/ci/requirements/environment-windows.yml b/ci/requirements/environment-windows.yml index 0941af474f7..4a33c20ca2a 100644 --- a/ci/requirements/environment-windows.yml +++ b/ci/requirements/environment-windows.yml @@ -41,4 +41,5 @@ dependencies: - sparse - toolz - typing_extensions + - xarray-datatree - zarr diff --git a/ci/requirements/environment.yml 
b/ci/requirements/environment.yml index e87e69138ee..f59e0934551 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -46,3 +46,5 @@ dependencies: - toolz - typing_extensions - zarr + - pip: + - git+https://github.com/xarray-contrib/datatree diff --git a/doc/api.rst b/doc/api.rst index 0d56fc73997..a253c53b07b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1133,6 +1133,20 @@ used filetypes in the xarray universe. backends.StoreBackendEntrypoint backends.ZarrBackendEntrypoint +DataTree +======== + +Experimental API for handling nested groups of data. +Requires the `xarray-datatree package <https://github.com/xarray-contrib/datatree>`_ to be installed. +See the `datatree documentation <https://xarray-datatree.readthedocs.io/en/latest/>`_ for details. + +.. autosummary:: + :toctree: generated/ + + DataTree + open_datatree + register_datatree_accessor + Deprecated / Pending Deprecation ================================ diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 9e5d7ad71a3..eb091871256 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -156,6 +156,9 @@ to the original netCDF file, regardless if they exist in the original dataset. Groups ~~~~~~ +Single groups as datasets +......................... + NetCDF groups are not supported as part of the :py:class:`Dataset` data model. Instead, groups can be loaded individually as Dataset objects. To do so, pass a ``group`` keyword argument to the @@ -228,10 +231,34 @@ Either of these groups can be loaded from the file as an independent :py:class:` Data variables: b int64 ... -.. note:: +.. _io.netcdf_datatree_groups: + +Multiple Groups as a DataTree +............................. + +For native handling of multiple groups with xarray, including I/O, you might be interested in the experimental +`xarray-datatree <https://github.com/xarray-contrib/datatree>`_ package. +If installed, this package's API can be imported directly from xarray, i.e. ``from xarray import DataTree``. 
+ +Whilst netCDF groups can only be loaded individually as Dataset objects, a whole file of many nested groups can be loaded +as a single :py:class:`DataTree` object. +To open a whole netCDF file as a tree of groups use the :py:func:`open_datatree()` function. +To save a DataTree object as a netCDF file containing many groups, use the :py:meth:`DataTree.to_netcdf()` method. + +.. _netcdf.group.warning: + +.. warning:: + ``DataTree`` objects do not follow the exact same data model as netCDF files, which means that perfect round-tripping + is not always possible. + + In particular, in the netCDF data model, dimensions are entities that can exist regardless of whether any variable possesses them. + This is in contrast to `xarray's data model <https://docs.xarray.dev/en/stable/user-guide/data-structures.html>`_ + (and hence `datatree's data model <https://xarray-datatree.readthedocs.io/en/latest/data-structures.html>`_) in which the dimensions of a (Dataset/Tree) + object are simply the set of dimensions present across all variables in that dataset. - For native handling of multiple groups with xarray, including I/O, you might be interested in the experimental - `xarray-datatree <https://github.com/xarray-contrib/datatree>`_ package. + This means that if a netCDF file contains dimensions but no variables which possess those dimensions, + these dimensions will not be present when that file is opened as a DataTree object. + Saving this DataTree object to file will therefore not preserve these "unused" dimensions. .. _io.encoding: @@ -633,6 +660,21 @@ To read back a zarr dataset that has been created this way, we use the ds_zarr = xr.open_zarr("path/to/directory.zarr") ds_zarr +Groups +~~~~~~ + +Like for netCDF, zarr groups can either be opened as individual :py:class:`Dataset` objects using the ``group`` keyword argument to :py:func:`open_dataset`, +or alternatively nested groups in zarr stores can be represented by loading the store as a :py:class:`DataTree` object. +(The latter option requires that you have the `xarray-datatree <https://github.com/xarray-contrib/datatree>`_ package installed.) + +To open a whole zarr store as a tree of groups use the :py:func:`open_datatree()` function. 
+To save a DataTree object as a zarr store containing many groups, use the :py:meth:`DataTree.to_zarr()` method. + +.. note:: + Perfect round-tripping should always be possible with a zarr store (:ref:`unlike for netCDF files <netcdf.group.warning>`), + as zarr does not support "unused" dimensions. + + Cloud Storage Buckets ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 26bd72b0727..27337a2f8e1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,15 @@ v2023.01.1 (unreleased) New Features ~~~~~~~~~~~~ +- Allow importing the prototype :py:class:`DataTree` class (as well as the accompanying :py:func:`open_datatree()` and :py:func:`register_datatree_accessor` functions). + Currently ``from xarray import DataTree`` disguises an import from a separate package ``xarray-contrib/xarray-datatree``. + Importing these features will raise an ``ImportError`` unless the datatree package is installed. + Full integration of the :py:class:`DataTree` class in xarray is planned in the future (see our development roadmap), + but for now is proceeding on a provisional basis, and as such the API is still experimental and subject to change without notice. + In the meantime, you are encouraged to try using these features, and please let us know about your experiences! + (:issue:`4118`, :pull:`7418`) + By `Tom Nicholas <https://github.com/TomNicholas>`_. + Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/__init__.py b/xarray/__init__.py index d064502c20b..9585172579c 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -52,6 +52,12 @@ # Disable minimum version checks on downstream libraries. __version__ = "999" +try: + from datatree import DataTree, register_datatree_accessor, open_datatree # noqa +except ImportError: + pass + + # A hardcoded __all__ variable is necessary to appease # `mypy --strict` running in projects that import xarray. 
def to_datatree(self, node_name: str | None = None, name: str | None = None):
    """
    Convert this dataarray into a datatree.DataTree.

    .. warning::
        The DataTree structure is considered experimental,
        and the API is less solidified than for other xarray features.

    The array is first converted with ``self.to_dataset(name=name)``; the
    resulting dataset is then wrapped in a single ``DataTree`` node, so the
    returned tree consists of exactly one node.

    Requires the xarray-datatree package to be installed.
    Find it at https://github.com/xarray-contrib/datatree.

    Parameters
    ----------
    node_name : str, optional
        The name of the datatree node created.
    name : str, optional
        Name to substitute for this array's name.

    Returns
    -------
    dt : DataTree
        A single-node datatree object, containing the information from this dataarray.

    Raises
    ------
    ImportError
        If the ``datatree`` package cannot be imported.

    See Also
    --------
    datatree.DataTree
    """

    # datatree is an optional dependency, so it is imported lazily here
    # rather than at module import time.
    try:
        from datatree import DataTree
    except ImportError as err:
        # Chain explicitly so the traceback shows the underlying import
        # failure as the direct cause of this more helpful message.
        raise ImportError(
            "Could not import the datatree package. "
            "Find it at https://github.com/xarray-contrib/datatree"
        ) from err

    ds = self.to_dataset(name=name)
    return DataTree(data=ds, name=node_name)
def to_datatree(self, node_name: str | None = None):
    """
    Convert this dataset into a datatree.DataTree.

    .. warning::
        The DataTree structure is considered experimental,
        and the API is less solidified than for other xarray features.

    The dataset is passed as the ``data`` of a single ``DataTree`` node,
    so the returned tree consists of exactly one node.

    Requires the xarray-datatree package to be installed.
    Find it at https://github.com/xarray-contrib/datatree.

    Parameters
    ----------
    node_name : str, optional
        The name of the datatree node created.

    Returns
    -------
    dt : DataTree
        A single-node datatree object, containing the information from this dataset.

    Raises
    ------
    ImportError
        If the ``datatree`` package cannot be imported.

    See Also
    --------
    datatree.DataTree
    """

    # datatree is an optional dependency, so it is imported lazily here
    # rather than at module import time.
    try:
        from datatree import DataTree
    except ImportError as err:
        # Chain explicitly so the traceback shows the underlying import
        # failure as the direct cause of this more helpful message.
        raise ImportError(
            "Could not import the datatree package. "
            "Find it at https://github.com/xarray-contrib/datatree"
        ) from err

    return DataTree(data=self, name=node_name)
"""Smoke tests for the optional datatree integration exposed through xarray."""
import pytest

# xarray only re-exports ``DataTree`` when the optional datatree package can
# be imported, so the skip guard has to run *before* ``from xarray import
# DataTree``. Otherwise collection fails with an ImportError instead of the
# module being skipped.
pytest.importorskip("datatree")

import xarray.testing as xrt  # noqa: E402
from xarray import Dataset, DataTree  # noqa: E402


def test_import_datatree():
    """Just test importing datatree package from xarray-contrib repo"""

    DataTree()


def test_to_datatree():
    """Check Dataset.to_datatree and DataArray.to_datatree build a named single node."""

    ds = Dataset({"a": ("x", [1, 2, 3])})
    dt = ds.to_datatree(node_name="group1")

    assert isinstance(dt, DataTree)
    assert dt.name == "group1"
    xrt.assert_identical(dt.to_dataset(), ds)

    da = ds["a"]
    dt = da.to_datatree(node_name="group1")

    assert isinstance(dt, DataTree)
    assert dt.name == "group1"
    xrt.assert_identical(dt["a"], da)