diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 78ef2875b31..96d965ed347 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -15,6 +15,8 @@ New Features
 - :py:meth:`DataTree.to_netcdf` can now write to a file-like object, or return
   bytes if called without a filepath. (:issue:`10570`)
   By `Matthew Willson `_.
+- Added an ``errors`` option to :py:func:`open_mfdataset` for handling invalid files. (:issue:`6736`)
+  By `Pratiman Patel `_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 2a6476ea828..b20af13e628 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -19,6 +19,7 @@
     Any,
     Final,
     Literal,
+    TypeVar,
     Union,
     cast,
     overload,
@@ -45,8 +46,8 @@
 from xarray.core.datatree import DataTree
 from xarray.core.indexes import Index
 from xarray.core.treenode import group_subtrees
-from xarray.core.types import NetcdfWriteModes, ZarrWriteModes
-from xarray.core.utils import is_remote_uri
+from xarray.core.types import NetcdfWriteModes, ReadBuffer, ZarrWriteModes
+from xarray.core.utils import emit_user_level_warning, is_remote_uri
 from xarray.namedarray.daskmanager import DaskManager
 from xarray.namedarray.parallelcompat import guess_chunkmanager
 from xarray.structure.chunks import _get_chunk, _maybe_chunk
@@ -73,6 +74,7 @@
     from xarray.core.types import (
         CombineAttrsOptions,
         CompatOptions,
+        ErrorOptionsWithWarn,
         JoinOptions,
         NestedSequence,
         ReadBuffer,
@@ -1459,6 +1461,28 @@ def open_groups(
     return groups
 
 
+_FLike = TypeVar("_FLike", bound=Union[str, ReadBuffer])
+
+
+def _remove_path(
+    paths: NestedSequence[_FLike], paths_to_remove: set[_FLike]
+) -> NestedSequence[_FLike]:
+    # Accumulate surviving paths, preserving the input nesting
+    result: list[Union[_FLike, NestedSequence[_FLike]]] = []
+
+    for item in paths:
+        if isinstance(item, list):
+            # If the current item is a list, recursively call _remove_path on it
+            nested_result = _remove_path(item, paths_to_remove)
+            if nested_result:  # Only keep sublists that are still non-empty
+                result.append(nested_result)
+        elif item not in paths_to_remove:
+            # Keep the item if it is not marked for removal
+            result.append(item)
+
+    return result
+
+
 def open_mfdataset(
     paths: str
     | os.PathLike
@@ -1487,6 +1511,7 @@
     join: JoinOptions | CombineKwargDefault = _JOIN_DEFAULT,
     attrs_file: str | os.PathLike | None = None,
     combine_attrs: CombineAttrsOptions = "override",
+    errors: ErrorOptionsWithWarn = "raise",
     **kwargs,
 ) -> Dataset:
     """Open multiple files as a single dataset.
@@ -1613,6 +1638,12 @@
         If a callable, it must expect a sequence of ``attrs`` dicts and a context object
         as its only parameters.
+    errors : {"raise", "warn", "ignore"}, default: "raise"
+        String indicating how to handle errors when opening datasets.
+
+        - "raise": an invalid dataset will raise an exception.
+        - "warn": a warning will be issued for each invalid dataset.
+        - "ignore": invalid datasets will be ignored.
     **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`.
         For an overview of some of the possible options, see the documentation of
 
@@ -1705,7 +1736,32 @@ def open_mfdataset(
         open_ = open_dataset
         getattr_ = getattr
 
-    datasets = [open_(p, **open_kwargs) for p in paths1d]
+    if errors not in ("raise", "warn", "ignore"):
+        raise ValueError(
+            f"'errors' must be 'raise', 'warn' or 'ignore', got '{errors}'"
+        )
+
+    datasets = []
+    invalid_paths = set()
+    for p in paths1d:
+        try:
+            ds = open_(p, **open_kwargs)
+            datasets.append(ds)
+        except Exception as e:
+            if errors == "raise":
+                raise
+            elif errors == "warn":
+                emit_user_level_warning(f"Could not open {p} due to {e}. Ignoring.")
+            # Record the invalid path; it is pruned from `paths` below
+            invalid_paths.add(p)
+
+    if invalid_paths:
+        paths = _remove_path(paths, invalid_paths)
+        if combine == "nested":
+            # Recompute the nested concat order now that paths were removed
+            combined_ids_paths = _infer_concat_order_from_positions(paths)
+            ids = list(combined_ids_paths.keys())
+
     closers = [getattr_(ds, "_close") for ds in datasets]
     if preprocess is not None:
         datasets = [preprocess(ds) for ds in datasets]
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 2ff73203580..ef68a959dfc 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -5371,6 +5371,68 @@ def test_open_mfdataset_2(self) -> None:
         ) as actual:
             assert_identical(original, actual)
 
+    def test_open_mfdataset_with_ignore(self) -> None:
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with create_tmp_files(2) as (tmp1, tmp2):
+            ds1 = original.isel(x=slice(5))
+            ds2 = original.isel(x=slice(5, 10))
+            ds1.to_netcdf(tmp1)
+            ds2.to_netcdf(tmp2)
+            with open_mfdataset(
+                [tmp1, "non-existent-file.nc", tmp2],
+                concat_dim="x",
+                combine="nested",
+                errors="ignore",
+            ) as actual:
+                assert_identical(original, actual)
+
+    def test_open_mfdataset_with_warn(self) -> None:
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with pytest.warns(UserWarning, match="Ignoring."):
+            with create_tmp_files(2) as (tmp1, tmp2):
+                ds1 = original.isel(x=slice(5))
+                ds2 = original.isel(x=slice(5, 10))
+                ds1.to_netcdf(tmp1)
+                ds2.to_netcdf(tmp2)
+                with open_mfdataset(
+                    [tmp1, "non-existent-file.nc", tmp2],
+                    concat_dim="x",
+                    combine="nested",
+                    errors="warn",
+                ) as actual:
+                    assert_identical(original, actual)
+
+    def test_open_mfdataset_2d_with_ignore(self) -> None:
+        original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
+        with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4):
+            original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1)
+            original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2)
+            original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3)
+            original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4)
+            with open_mfdataset(
+                [[tmp1, tmp2], ["non-existent-file.nc", tmp3, tmp4]],
+                combine="nested",
+                concat_dim=["y", "x"],
+                errors="ignore",
+            ) as actual:
+                assert_identical(original, actual)
+
+    def test_open_mfdataset_2d_with_warn(self) -> None:
+        original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
+        with pytest.warns(UserWarning, match="Ignoring."):
+            with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4):
+                original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1)
+                original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2)
+                original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3)
+                original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4)
+                with open_mfdataset(
+                    [[tmp1, tmp2, "non-existent-file.nc"], [tmp3, tmp4]],
+                    combine="nested",
+                    concat_dim=["y", "x"],
+                    errors="warn",
+                ) as actual:
+                    assert_identical(original, actual)
+
     def test_attrs_mfdataset(self) -> None:
         original = Dataset({"foo": ("x", np.random.randn(10))})
         with create_tmp_file() as tmp1:
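
For reviewers, here is a quick end-to-end sketch of the new option as exercised by the tests above. This is a minimal sketch, not part of the patch: the file names are hypothetical, and ``errors="warn"`` assumes this change is applied.

```python
import numpy as np
import xarray as xr

# Two valid netCDF files plus one path that does not exist
# (all file names here are hypothetical).
ds = xr.Dataset({"foo": ("x", np.random.randn(10))})
ds.isel(x=slice(5)).to_netcdf("part1.nc")
ds.isel(x=slice(5, 10)).to_netcdf("part2.nc")

# errors="raise" (the default) propagates the failure as before;
# errors="warn" emits a user-level warning and drops the bad path;
# errors="ignore" drops it silently.
combined = xr.open_mfdataset(
    ["part1.nc", "missing.nc", "part2.nc"],
    combine="nested",
    concat_dim="x",
    errors="warn",
)

# The two readable files are still concatenated along x.
assert combined.sizes["x"] == 10
```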
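The subtle piece is the pruning step: ``_remove_path`` must preserve the nesting that ``combine="nested"`` relies on while dropping any sublist that becomes empty. Below is a standalone, untyped restatement of the same recursion; ``remove_path`` is a hypothetical rename used only for illustration.

```python
def remove_path(paths, paths_to_remove):
    # Untyped sketch of the _remove_path helper in the diff above.
    result = []
    for item in paths:
        if isinstance(item, list):
            nested = remove_path(item, paths_to_remove)
            if nested:  # drop sublists that became empty
                result.append(nested)
        elif item not in paths_to_remove:
            result.append(item)
    return result

# An invalid entry is removed in place within its sublist...
assert remove_path([["a.nc", "bad.nc"], ["b.nc"]], {"bad.nc"}) == [["a.nc"], ["b.nc"]]
# ...and a sublist that loses all of its entries disappears entirely,
# which is why open_mfdataset recomputes the nested concat order afterwards.
assert remove_path([["a.nc"], ["bad.nc"]], {"bad.nc"}) == [["a.nc"]]
```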