Skip to content

Commit 3c19231

Browse files
eni-awowalepre-commit-ci[bot]dcherian
authored
Adding open_groups to BackendEntryPointEngine, NetCDF4BackendEntrypoint, and H5netcdfBackendEntrypoint (#9243)
- [x] Closes #9137 and in support of #8572 - [x] Tests added - [x] User visible changes (including notable bug fixes) are documented in `whats-new.rst` - [ ] New functions/methods are listed in `api.rst` Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Deepak Cherian <[email protected]>
1 parent abd627a commit 3c19231

File tree

9 files changed

+293
-44
lines changed

9 files changed

+293
-44
lines changed

doc/whats-new.rst

+2
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,8 @@ New Features
9090
to return an object without ``attrs``. A ``deep`` parameter controls whether
9191
variables' ``attrs`` are also dropped.
9292
By `Maximilian Roos <https://github.com/max-sixty>`_. (:pull:`8288`)
93+
- Add :py:func:`open_groups` method for unaligned datasets (:issue:`9137`, :pull:`9243`).
94+
By `Eni Awowale <https://github.com/eni-awowale>`_.
9395

9496
Breaking changes
9597
~~~~~~~~~~~~~~~~

xarray/backends/api.py

+37
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,43 @@ def open_datatree(
843843
return backend.open_datatree(filename_or_obj, **kwargs)
844844

845845

846+
def open_groups(
    filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
    engine: T_Engine = None,
    **kwargs,
) -> dict[str, Dataset]:
    """
    Open and decode a file or file-like object, creating a dictionary
    containing one xarray Dataset for each group in the file.

    Useful for an HDF file ("netcdf4" or "h5netcdf") containing many groups
    that are not alignable with their parents and cannot be opened directly
    with ``open_datatree``. It is encouraged to use this function to inspect
    your data, then make the necessary changes to make the structure coercible
    to a ``DataTree`` object before calling ``DataTree.from_dict()`` and
    proceeding with your analysis.

    Parameters
    ----------
    filename_or_obj : str, Path, file-like, or DataStore
        Strings and Path objects are interpreted as a path to a netCDF file.
    engine : str, optional
        Xarray backend engine to use. Valid options include
        ``{"netcdf4", "h5netcdf"}``. When omitted (``None``), the engine is
        guessed from ``filename_or_obj``.
    **kwargs : dict
        Additional keyword arguments passed to :py:func:`~xarray.open_dataset`
        for each group. They are forwarded unchanged to the selected backend's
        ``open_groups_as_dict`` method.

    Returns
    -------
    dict[str, xarray.Dataset]
        One Dataset per group, keyed by group name.

    Raises
    ------
    ValueError
        If ``engine`` is not a recognized engine name (raised while looking
        up the backend).
    NotImplementedError
        If the selected backend does not override ``open_groups_as_dict``.

    See Also
    --------
    open_datatree()
    DataTree.from_dict()
    """
    # Only guess when the caller did not pick an engine explicitly.
    if engine is None:
        engine = plugins.guess_engine(filename_or_obj)

    backend = plugins.get_backend(engine)

    # The backend does the actual per-group opening; no alignment or tree
    # consistency checks are applied here.
    return backend.open_groups_as_dict(filename_or_obj, **kwargs)
881+
882+
846883
def open_mfdataset(
847884
paths: str | NestedSequence[str | os.PathLike],
848885
chunks: T_Chunks | None = None,

xarray/backends/common.py

+17
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ def _iter_nc_groups(root, parent="/"):
132132
from xarray.core.treenode import NodePath
133133

134134
parent = NodePath(parent)
135+
yield str(parent)
135136
for path, group in root.groups.items():
136137
gpath = parent / path
137138
yield str(gpath)
@@ -535,6 +536,22 @@ def open_datatree(
535536

536537
raise NotImplementedError()
537538

539+
def open_groups_as_dict(
    self,
    filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
    **kwargs: Any,
) -> dict[str, Dataset]:
    """
    Opens a dictionary mapping from group names to Datasets.

    Called by :py:func:`~xarray.open_groups`.
    This function exists to provide a universal way to open all groups in a
    file, before applying any additional consistency checks or requirements
    necessary to create a ``DataTree`` object (typically done using
    :py:meth:`~xarray.DataTree.from_dict`).

    This base implementation is a placeholder: backends that support groups
    are expected to override it.

    Parameters
    ----------
    filename_or_obj : str, Path, file-like, or DataStore
        The file or store whose groups should be opened.
    **kwargs : Any
        Backend-specific options accepted by the overriding implementation.

    Returns
    -------
    dict[str, Dataset]
        Mapping from group names to the Dataset opened for each group.

    Raises
    ------
    NotImplementedError
        Always, unless the method is overridden by a subclass.
    """

    raise NotImplementedError()
554+
538555

539556
# mapping of engine name to (module name, BackendEntrypoint Class)
540557
BACKEND_ENTRYPOINTS: dict[str, tuple[str | None, type[BackendEntrypoint]]] = {}

xarray/backends/h5netcdf_.py

+37-13
Original file line numberDiff line numberDiff line change
@@ -448,9 +448,36 @@ def open_datatree(
448448
driver_kwds=None,
449449
**kwargs,
450450
) -> DataTree:
451-
from xarray.backends.api import open_dataset
452-
from xarray.backends.common import _iter_nc_groups
451+
453452
from xarray.core.datatree import DataTree
453+
454+
groups_dict = self.open_groups_as_dict(filename_or_obj, **kwargs)
455+
456+
return DataTree.from_dict(groups_dict)
457+
458+
def open_groups_as_dict(
459+
self,
460+
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
461+
*,
462+
mask_and_scale=True,
463+
decode_times=True,
464+
concat_characters=True,
465+
decode_coords=True,
466+
drop_variables: str | Iterable[str] | None = None,
467+
use_cftime=None,
468+
decode_timedelta=None,
469+
format=None,
470+
group: str | Iterable[str] | Callable | None = None,
471+
lock=None,
472+
invalid_netcdf=None,
473+
phony_dims=None,
474+
decode_vlen_strings=True,
475+
driver=None,
476+
driver_kwds=None,
477+
**kwargs,
478+
) -> dict[str, Dataset]:
479+
480+
from xarray.backends.common import _iter_nc_groups
454481
from xarray.core.treenode import NodePath
455482
from xarray.core.utils import close_on_error
456483

@@ -466,19 +493,19 @@ def open_datatree(
466493
driver=driver,
467494
driver_kwds=driver_kwds,
468495
)
496+
# Check for a group and make it a parent if it exists
469497
if group:
470498
parent = NodePath("/") / NodePath(group)
471499
else:
472500
parent = NodePath("/")
473501

474502
manager = store._manager
475-
ds = open_dataset(store, **kwargs)
476-
tree_root = DataTree.from_dict({str(parent): ds})
503+
groups_dict = {}
477504
for path_group in _iter_nc_groups(store.ds, parent=parent):
478505
group_store = H5NetCDFStore(manager, group=path_group, **kwargs)
479506
store_entrypoint = StoreBackendEntrypoint()
480507
with close_on_error(group_store):
481-
ds = store_entrypoint.open_dataset(
508+
group_ds = store_entrypoint.open_dataset(
482509
group_store,
483510
mask_and_scale=mask_and_scale,
484511
decode_times=decode_times,
@@ -488,14 +515,11 @@ def open_datatree(
488515
use_cftime=use_cftime,
489516
decode_timedelta=decode_timedelta,
490517
)
491-
new_node: DataTree = DataTree(name=NodePath(path_group).name, data=ds)
492-
tree_root._set_item(
493-
path_group,
494-
new_node,
495-
allow_overwrite=False,
496-
new_nodes_along_path=True,
497-
)
498-
return tree_root
518+
519+
group_name = str(NodePath(path_group))
520+
groups_dict[group_name] = group_ds
521+
522+
return groups_dict
499523

500524

501525
BACKEND_ENTRYPOINTS["h5netcdf"] = ("h5netcdf", H5netcdfBackendEntrypoint)

xarray/backends/netCDF4_.py

+35-13
Original file line numberDiff line numberDiff line change
@@ -688,9 +688,34 @@ def open_datatree(
688688
autoclose=False,
689689
**kwargs,
690690
) -> DataTree:
691-
from xarray.backends.api import open_dataset
692-
from xarray.backends.common import _iter_nc_groups
691+
693692
from xarray.core.datatree import DataTree
693+
694+
groups_dict = self.open_groups_as_dict(filename_or_obj, **kwargs)
695+
696+
return DataTree.from_dict(groups_dict)
697+
698+
def open_groups_as_dict(
699+
self,
700+
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
701+
*,
702+
mask_and_scale=True,
703+
decode_times=True,
704+
concat_characters=True,
705+
decode_coords=True,
706+
drop_variables: str | Iterable[str] | None = None,
707+
use_cftime=None,
708+
decode_timedelta=None,
709+
group: str | Iterable[str] | Callable | None = None,
710+
format="NETCDF4",
711+
clobber=True,
712+
diskless=False,
713+
persist=False,
714+
lock=None,
715+
autoclose=False,
716+
**kwargs,
717+
) -> dict[str, Dataset]:
718+
from xarray.backends.common import _iter_nc_groups
694719
from xarray.core.treenode import NodePath
695720

696721
filename_or_obj = _normalize_path(filename_or_obj)
@@ -704,19 +729,20 @@ def open_datatree(
704729
lock=lock,
705730
autoclose=autoclose,
706731
)
732+
733+
# Check for a group and make it a parent if it exists
707734
if group:
708735
parent = NodePath("/") / NodePath(group)
709736
else:
710737
parent = NodePath("/")
711738

712739
manager = store._manager
713-
ds = open_dataset(store, **kwargs)
714-
tree_root = DataTree.from_dict({str(parent): ds})
740+
groups_dict = {}
715741
for path_group in _iter_nc_groups(store.ds, parent=parent):
716742
group_store = NetCDF4DataStore(manager, group=path_group, **kwargs)
717743
store_entrypoint = StoreBackendEntrypoint()
718744
with close_on_error(group_store):
719-
ds = store_entrypoint.open_dataset(
745+
group_ds = store_entrypoint.open_dataset(
720746
group_store,
721747
mask_and_scale=mask_and_scale,
722748
decode_times=decode_times,
@@ -726,14 +752,10 @@ def open_datatree(
726752
use_cftime=use_cftime,
727753
decode_timedelta=decode_timedelta,
728754
)
729-
new_node: DataTree = DataTree(name=NodePath(path_group).name, data=ds)
730-
tree_root._set_item(
731-
path_group,
732-
new_node,
733-
allow_overwrite=False,
734-
new_nodes_along_path=True,
735-
)
736-
return tree_root
755+
group_name = str(NodePath(path_group))
756+
groups_dict[group_name] = group_ds
757+
758+
return groups_dict
737759

738760

739761
BACKEND_ENTRYPOINTS["netcdf4"] = ("netCDF4", NetCDF4BackendEntrypoint)

xarray/backends/plugins.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def get_backend(engine: str | type[BackendEntrypoint]) -> BackendEntrypoint:
193193
engines = list_engines()
194194
if engine not in engines:
195195
raise ValueError(
196-
f"unrecognized engine {engine} must be one of: {list(engines)}"
196+
f"unrecognized engine {engine} must be one of your download engines: {list(engines)}"
197197
"To install additional dependencies, see:\n"
198198
"https://docs.xarray.dev/en/stable/user-guide/io.html \n"
199199
"https://docs.xarray.dev/en/stable/getting-started-guide/installing.html"

xarray/core/datatree.py

+7-15
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,9 @@
99
Iterable,
1010
Iterator,
1111
Mapping,
12-
MutableMapping,
1312
)
1413
from html import escape
15-
from typing import (
16-
TYPE_CHECKING,
17-
Any,
18-
Generic,
19-
Literal,
20-
NoReturn,
21-
Union,
22-
overload,
23-
)
14+
from typing import TYPE_CHECKING, Any, Generic, Literal, NoReturn, Union, overload
2415

2516
from xarray.core import utils
2617
from xarray.core.alignment import align
@@ -776,7 +767,7 @@ def _replace_node(
776767
if data is not _default:
777768
self._set_node_data(ds)
778769

779-
self._children = children
770+
self.children = children
780771

781772
def copy(
782773
self: DataTree,
@@ -1073,7 +1064,7 @@ def drop_nodes(
10731064
@classmethod
10741065
def from_dict(
10751066
cls,
1076-
d: MutableMapping[str, Dataset | DataArray | DataTree | None],
1067+
d: Mapping[str, Dataset | DataArray | DataTree | None],
10771068
name: str | None = None,
10781069
) -> DataTree:
10791070
"""
@@ -1101,7 +1092,8 @@ def from_dict(
11011092
"""
11021093

11031094
# First create the root node
1104-
root_data = d.pop("/", None)
1095+
d_cast = dict(d)
1096+
root_data = d_cast.pop("/", None)
11051097
if isinstance(root_data, DataTree):
11061098
obj = root_data.copy()
11071099
obj.orphan()
@@ -1112,10 +1104,10 @@ def depth(item) -> int:
11121104
pathstr, _ = item
11131105
return len(NodePath(pathstr).parts)
11141106

1115-
if d:
1107+
if d_cast:
11161108
# Populate tree with children determined from data_objects mapping
11171109
# Sort keys by depth so as to insert nodes from root first (see GH issue #9276)
1118-
for path, data in sorted(d.items(), key=depth):
1110+
for path, data in sorted(d_cast.items(), key=depth):
11191111
# Create and set new node
11201112
node_name = NodePath(path).name
11211113
if isinstance(data, DataTree):

0 commit comments

Comments
 (0)