diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index e0a9853ee45..1d3d6d3bf5d 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -24,6 +24,8 @@ New Features
 - Added `scipy-stubs `_ to the ``xarray[types]`` dependencies.
   By `Joren Hammudoglu `_.
+- Added ``errors`` arg to :py:meth:`open_mfdataset` to better handle invalid files.
+  (:issue:`6736`, :pull:`9955`). By `Pratiman Patel `_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
@@ -246,26 +248,8 @@ eventually be deprecated.
 New Features
 ~~~~~~~~~~~~
 
-- Relax nanosecond resolution restriction in CF time coding and permit
-  :py:class:`numpy.datetime64` or :py:class:`numpy.timedelta64` dtype arrays
-  with ``"s"``, ``"ms"``, ``"us"``, or ``"ns"`` resolution throughout xarray
-  (:issue:`7493`, :pull:`9618`, :pull:`9977`, :pull:`9966`, :pull:`9999`). By
-  `Kai Mühlbauer `_ and `Spencer Clark
-  `_.
-- Enable the ``compute=False`` option in :py:meth:`DataTree.to_zarr`. (:pull:`9958`).
-  By `Sam Levang `_.
-- Improve the error message raised when no key is matching the available variables in a dataset. (:pull:`9943`)
-  By `Jimmy Westling `_.
-- Added a ``time_unit`` argument to :py:meth:`CFTimeIndex.to_datetimeindex`.
-  Note that in a future version of xarray,
-  :py:meth:`CFTimeIndex.to_datetimeindex` will return a microsecond-resolution
-  :py:class:`pandas.DatetimeIndex` instead of a nanosecond-resolution
-  :py:class:`pandas.DatetimeIndex` (:pull:`9965`). By `Spencer Clark
-  `_ and `Kai Mühlbauer
-  `_.
-- Adds shards to the list of valid_encodings in the zarr backend, so that
-  sharded Zarr V3s can be written (:issue:`9947`, :pull:`9948`).
-  By `Jacob Prince_Bieker `_
+- Added ``errors`` arg to :py:meth:`open_mfdataset` to better handle invalid files.
+  (:issue:`6736`, :pull:`9955`). By `Pratiman Patel `_.
 
 Deprecations
 ~~~~~~~~~~~~
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index f30f4e54705..ca7c648715e 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import warnings
 from collections.abc import (
     Callable,
     Hashable,
@@ -61,6 +62,7 @@
 from xarray.core.types import (
     CombineAttrsOptions,
     CompatOptions,
+    ErrorOptionsWithWarn,
     JoinOptions,
     NestedSequence,
     ReadBuffer,
@@ -1389,6 +1391,38 @@ def open_groups(
     return groups
 
 
+def _remove_path(paths, path_to_remove) -> list:
+    """
+    Recursively removes a specific path from a nested or non-nested list.
+
+    Parameters
+    ----------
+    paths : list
+        The path list (nested or not) from which to remove paths.
+    path_to_remove : list
+        The paths to be removed.
+
+    Returns
+    -------
+    list
+        A new list with the specified paths removed.
+    """
+    # Initialize an empty list to store the result
+    result = []
+
+    for item in paths:
+        if isinstance(item, list):
+            # If the current item is a list, recursively call _remove_path on it
+            nested_result = _remove_path(item, path_to_remove)
+            if nested_result:  # Only keep non-empty nested lists
+                result.append(nested_result)
+        elif item not in path_to_remove:
+            # Keep the item if it is not among the paths to remove
+            result.append(item)
+
+    return result
+
+
 def open_mfdataset(
     paths: str
     | os.PathLike
@@ -1414,6 +1448,7 @@ def open_mfdataset(
     join: JoinOptions = "outer",
     attrs_file: str | os.PathLike | None = None,
     combine_attrs: CombineAttrsOptions = "override",
+    errors: ErrorOptionsWithWarn = "raise",
     **kwargs,
 ) -> Dataset:
     """Open multiple files as a single dataset.
@@ -1540,7 +1575,12 @@ def open_mfdataset(
         If a callable, it must expect a sequence of ``attrs`` dicts and a context
         object as its only parameters.
-    **kwargs : optional
+    errors : {'raise', 'warn', 'ignore'}, default: 'raise'
+        - If 'raise', then an invalid dataset will raise an exception.
+        - If 'warn', then a warning will be issued for each invalid dataset.
+        - If 'ignore', then invalid datasets will be ignored.
+
+    **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`. For an
         overview of some of the possible options, see the documentation of
         :py:func:`xarray.open_dataset`
@@ -1632,7 +1672,32 @@ def open_mfdataset(
         open_ = open_dataset
         getattr_ = getattr
 
-    datasets = [open_(p, **open_kwargs) for p in paths1d]
+    if errors not in ("raise", "warn", "ignore"):
+        raise ValueError(
+            f"'errors' must be 'raise', 'warn' or 'ignore', got '{errors}'"
+        )
+
+    datasets = []
+    remove_paths = False
+    for p in paths1d:
+        try:
+            ds = open_(p, **open_kwargs)
+            datasets.append(ds)
+        except Exception:
+            # Prune the invalid path so nested combine can still infer order
+            if combine == "nested":
+                paths = _remove_path(paths, [p])
+                remove_paths = True
+            if errors == "raise":
+                raise
+            elif errors == "warn":
+                warnings.warn(
+                    f"Could not open {p}. Ignoring.", UserWarning, stacklevel=2
+                )
+                continue
+            else:
+                continue
+
     closers = [getattr_(ds, "_close") for ds in datasets]
     if preprocess is not None:
         datasets = [preprocess(ds) for ds in datasets]
@@ -1645,6 +1710,11 @@ def open_mfdataset(
     # Combine all datasets, closing them in case of a ValueError
     try:
         if combine == "nested":
+            # Re-infer the combine ids from the pruned path list
+            if remove_paths:
+                combined_ids_paths = _infer_concat_order_from_positions(paths)
+                ids = list(combined_ids_paths.keys())
+
             # Combined nested list by successive concat and merge operations
             # along each dimension, using structure given by "ids"
             combined = _nested_combine(
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index ec9f2fe8aef..a579a520b8e 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -4978,6 +4978,68 @@ def test_open_mfdataset_2(self) -> None:
             ) as actual:
                 assert_identical(original, actual)
 
+    def test_open_mfdataset_with_ignore(self) -> None:
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with create_tmp_files(2) as (tmp1, tmp2):
+            ds1 = original.isel(x=slice(5))
+            ds2 = original.isel(x=slice(5, 10))
+            ds1.to_netcdf(tmp1)
+            ds2.to_netcdf(tmp2)
+            with open_mfdataset(
+                [tmp1, "non-existent-file.nc", tmp2],
+                concat_dim="x",
+                combine="nested",
+                errors="ignore",
+            ) as actual:
+                assert_identical(original, actual)
+
+    def test_open_mfdataset_with_warn(self) -> None:
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with pytest.warns(UserWarning, match="Ignoring."):
+            with create_tmp_files(2) as (tmp1, tmp2):
+                ds1 = original.isel(x=slice(5))
+                ds2 = original.isel(x=slice(5, 10))
+                ds1.to_netcdf(tmp1)
+                ds2.to_netcdf(tmp2)
+                with open_mfdataset(
+                    [tmp1, "non-existent-file.nc", tmp2],
+                    concat_dim="x",
+                    combine="nested",
+                    errors="warn",
+                ) as actual:
+                    assert_identical(original, actual)
+
+    def test_open_mfdataset_2d_with_ignore(self) -> None:
+        original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
+        with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4):
+            original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1)
+            original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2)
+            original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3)
+            original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4)
+            with open_mfdataset(
+                [[tmp1, tmp2], ["non-existent-file.nc", tmp3, tmp4]],
+                combine="nested",
+                concat_dim=["y", "x"],
+                errors="ignore",
+            ) as actual:
+                assert_identical(original, actual)
+
+    def test_open_mfdataset_2d_with_warn(self) -> None:
+        original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
+        with pytest.warns(UserWarning, match="Ignoring."):
+            with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4):
+                original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1)
+                original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2)
+                original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3)
+                original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4)
+                with open_mfdataset(
+                    [[tmp1, tmp2, "non-existent-file.nc"], [tmp3, tmp4]],
+                    combine="nested",
+                    concat_dim=["y", "x"],
+                    errors="warn",
+                ) as actual:
+                    assert_identical(original, actual)
+
     def test_attrs_mfdataset(self) -> None:
         original = Dataset({"foo": ("x", np.random.randn(10))})
         with create_tmp_file() as tmp1:
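For reference, a minimal usage sketch of the ``errors`` argument this diff introduces. The file names are hypothetical, and the example assumes a netCDF backend and dask are installed (``open_mfdataset`` returns a dask-backed dataset); note that any exception raised while opening a path triggers the same handling, not only a missing file:

```python
import numpy as np
import xarray as xr

# Write two valid fragments of a length-10 series along "x".
xr.Dataset({"foo": ("x", np.arange(5))}).to_netcdf("part1.nc")
xr.Dataset({"foo": ("x", np.arange(5, 10))}).to_netcdf("part2.nc")

# errors="warn" emits a UserWarning for the unreadable path and prunes it
# from the nested concat structure; errors="ignore" prunes it silently;
# errors="raise" (the default) keeps the previous behavior.
with xr.open_mfdataset(
    ["part1.nc", "missing.nc", "part2.nc"],
    combine="nested",
    concat_dim="x",
    errors="warn",
) as ds:
    print(ds.foo.values)  # the two valid fragments, concatenated along "x"
```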