Switch to custom netcdf4/hdf5 backend #395

Merged (12 commits) on Jan 30, 2025
6 changes: 3 additions & 3 deletions virtualizarr/backend.py

@@ -13,7 +13,7 @@
 from virtualizarr.readers import (
     DMRPPVirtualBackend,
     FITSVirtualBackend,
-    HDF5VirtualBackend,
+    HDFVirtualBackend,
     KerchunkVirtualBackend,
     NetCDF3VirtualBackend,
     TIFFVirtualBackend,
@@ -27,9 +27,9 @@
     "kerchunk": KerchunkVirtualBackend,
     "zarr_v3": ZarrV3VirtualBackend,
     "dmrpp": DMRPPVirtualBackend,
+    "hdf5": HDFVirtualBackend,
+    "netcdf4": HDFVirtualBackend,  # note this is the same as for hdf5
     # all the below call one of the kerchunk backends internally (https://fsspec.github.io/kerchunk/reference.html#file-format-backends)
-    "hdf5": HDF5VirtualBackend,
-    "netcdf4": HDF5VirtualBackend,  # note this is the same as for hdf5
     "netcdf3": NetCDF3VirtualBackend,
     "tiff": TIFFVirtualBackend,
     "fits": FITSVirtualBackend,
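To make the registry change concrete, here is a hedged sketch of what it means at the call site; the filename is hypothetical, and the keyword arguments mirror the tests further down in this diff.

```python
from virtualizarr import open_virtual_dataset
from virtualizarr.readers import HDF5VirtualBackend  # kerchunk-based reader
from virtualizarr.readers.hdf import HDFVirtualBackend  # new custom reader

# After this change, the "netcdf4" and "hdf5" filetypes resolve to the
# custom HDFVirtualBackend by default:
vds = open_virtual_dataset("data.nc", filetype="netcdf4", indexes={})

# The kerchunk-based reader is still importable and can be selected
# explicitly via the backend keyword:
vds_kerchunk = open_virtual_dataset("data.nc", indexes={}, backend=HDF5VirtualBackend)
```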
7 changes: 7 additions & 0 deletions virtualizarr/tests/__init__.py

@@ -7,6 +7,8 @@

 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.manifests.manifest import join
+from virtualizarr.readers import HDF5VirtualBackend
+from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.zarr import ZArray, ceildiv

 requires_network = pytest.mark.network
@@ -42,6 +44,11 @@ def _importorskip(
 has_zarr_python, requires_zarr_python = _importorskip("zarr")
 has_zarr_python_v3, requires_zarr_python_v3 = _importorskip("zarr", "3.0.0b")

+parametrize_over_hdf_backends = pytest.mark.parametrize(
+    "hdf_backend",
+    [HDF5VirtualBackend, HDFVirtualBackend] if has_kerchunk else [HDFVirtualBackend],
+)
+

 def create_manifestarray(
     shape: tuple[int, ...], chunks: tuple[int, ...]
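For illustration, a test module would consume the new helper like this (a hedged sketch: the test body is invented, and `netcdf4_file` is assumed to be one of the suite's existing fixtures). The mark injects each available backend class as the test's `hdf_backend` argument, and the list collapses to just `HDFVirtualBackend` when kerchunk is not installed.

```python
from virtualizarr import open_virtual_dataset
from virtualizarr.tests import parametrize_over_hdf_backends


@parametrize_over_hdf_backends
def test_opens_with_any_hdf_backend(netcdf4_file, hdf_backend):
    # pytest runs this once per backend class in the parametrized list;
    # without kerchunk installed, only HDFVirtualBackend is exercised.
    vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
    assert len(vds.dims) > 0
```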
80 changes: 45 additions & 35 deletions virtualizarr/tests/test_backend.py

@@ -15,7 +15,7 @@
 from virtualizarr.readers.hdf import HDFVirtualBackend
 from virtualizarr.tests import (
     has_astropy,
-    requires_kerchunk,
+    parametrize_over_hdf_backends,
     requires_network,
     requires_s3fs,
     requires_scipy,
@@ -82,8 +82,7 @@ def test_FileType():
     FileType(None)


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetIndexes:
     def test_no_indexes(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
@@ -122,8 +121,7 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I
     return True


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 def test_cftime_index(tmpdir, hdf_backend):
     """Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
     # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
@@ -152,8 +150,7 @@ def test_cftime_index(tmpdir, hdf_backend):
     assert vds.attrs == ds.attrs


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetAttrs:
     def test_drop_array_dimensions(self, netcdf4_file, hdf_backend):
         # regression test for GH issue #150
@@ -171,14 +168,16 @@ def test_coordinate_variable_attrs_preserved(self, netcdf4_file, hdf_backend):
     }


-@requires_kerchunk
+@parametrize_over_hdf_backends
 class TestDetermineCoords:
-    def test_infer_one_dimensional_coords(self, netcdf4_file):
-        vds = open_virtual_dataset(netcdf4_file, indexes={})
+    def test_infer_one_dimensional_coords(self, netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
         assert set(vds.coords) == {"time", "lat", "lon"}

-    def test_var_attr_coords(self, netcdf4_file_with_2d_coords):
-        vds = open_virtual_dataset(netcdf4_file_with_2d_coords, indexes={})
+    def test_var_attr_coords(self, netcdf4_file_with_2d_coords, hdf_backend):
+        vds = open_virtual_dataset(
+            netcdf4_file_with_2d_coords, indexes={}, backend=hdf_backend
+        )

         expected_dimension_coords = ["ocean_time", "s_rho"]
         expected_2d_coords = ["lon_rho", "lat_rho", "h"]
@@ -199,7 +198,7 @@ class TestReadFromS3:
     @pytest.mark.parametrize(
         "indexes", [None, {}], ids=["None index", "empty dict index"]
     )
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_anon_read_s3(self, indexes, hdf_backend):
        """Parameterized tests for empty vs supplied indexes and filetypes."""
        # TODO: Switch away from this s3 url after minIO is implemented.
@@ -217,7 +216,7 @@ def test_anon_read_s3(self, indexes, hdf_backend):


 @requires_network
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestReadFromURL:
     @pytest.mark.parametrize(
         "filetype, url",
@@ -318,46 +317,55 @@ def test_virtualizarr_vs_local_nisar(self, hdf_backend):
         xrt.assert_equal(dsXR, dsV)


-@requires_kerchunk
-def test_open_empty_group(empty_netcdf4_file):
-    vds = open_virtual_dataset(empty_netcdf4_file, indexes={})
-    assert isinstance(vds, xr.Dataset)
-    expected = Dataset()
-    xrt.assert_identical(vds, expected)
-
-
-@requires_kerchunk
+@parametrize_over_hdf_backends
 class TestOpenVirtualDatasetHDFGroup:
-    def test_open_subgroup(self, netcdf4_file_with_data_in_multiple_groups):
+    def test_open_empty_group(self, empty_netcdf4_file, hdf_backend):
+        vds = open_virtual_dataset(empty_netcdf4_file, indexes={}, backend=hdf_backend)
+        assert isinstance(vds, xr.Dataset)
+        expected = Dataset()
+        xrt.assert_identical(vds, expected)
+
+    def test_open_subgroup(
+        self, netcdf4_file_with_data_in_multiple_groups, hdf_backend
+    ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, group="subgroup", indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            group="subgroup",
+            indexes={},
+            backend=hdf_backend,
         )
[Review comment, Contributor Author]
This is what the output looks like for the kerchunk-based reader:

<xarray.Dataset> Size: 16B
Dimensions:  (dim_0: 2)
Dimensions without coordinates: dim_0
Data variables:
    bar      (dim_0) int64 16B ManifestArray<shape=(2,), dtype=int64, chunks=...

vs the custom reader:

<xarray.Dataset> Size: 32B
Dimensions:  (dim_0: 2)
Coordinates:
    dim_0    (dim_0) float64 16B 0.0 0.0
Data variables:
    bar      (dim_0) int64 16B ManifestArray<shape=(2,), dtype=int64, chunks=...

for reference, here is what it looks like if I just naively open it as a regular dataset:

(Pdb) xr.open_dataset(netcdf4_file_with_data_in_multiple_groups, group="subgroup")
<xarray.Dataset> Size: 16B
Dimensions:  (dim_0: 2)
Dimensions without coordinates: dim_0
Data variables:
    bar      (dim_0) int64 16B ...

I think this is probably the source of the nbytes difference too.

[Review comment, Member]
That seems like a bug in the HDF reader. If xarray doesn't think this dimension has a coordinate, then virtualizarr's HDF reader shouldn't create one either.

Opened #401 to track this (FYI @sharkinsspatial)
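A hedged sketch of how to reproduce the comparison above; the file path is hypothetical, standing in for the suite's fixture that puts a `bar` variable inside a `subgroup` group.

```python
from virtualizarr import open_virtual_dataset
from virtualizarr.readers import HDF5VirtualBackend  # kerchunk-based
from virtualizarr.readers.hdf import HDFVirtualBackend  # custom

path = "multiple_groups.nc"  # hypothetical stand-in for the fixture file

vds_kerchunk = open_virtual_dataset(
    path, group="subgroup", indexes={}, backend=HDF5VirtualBackend
)
vds_custom = open_virtual_dataset(
    path, group="subgroup", indexes={}, backend=HDFVirtualBackend
)

# The kerchunk-based reader leaves dim_0 as a dimension without a
# coordinate, while the custom reader materializes one (tracked in #401):
assert "dim_0" not in vds_kerchunk.coords
assert "dim_0" in vds_custom.coords
```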

         assert list(vds.variables) == ["bar"]
         assert isinstance(vds["bar"].data, ManifestArray)
         assert vds["bar"].shape == (2,)

[Review comment, Member]
Couldn't this test be combined with the one below by parameterizing group over ("", None)?

[Review comment, Member]
Done in 8706af5
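The consolidated version would presumably look something like this hedged sketch (not necessarily the exact code of 8706af5): a method on TestOpenVirtualDatasetHDFGroup with `group` parametrized so both root-group spellings share one body.

```python
import pytest

@pytest.mark.parametrize("group", ["", None])
def test_open_root_group(
    self, netcdf4_file_with_data_in_multiple_groups, hdf_backend, group
):
    # group="" selects the root group explicitly; group=None is the default.
    vds = open_virtual_dataset(
        netcdf4_file_with_data_in_multiple_groups,
        group=group,
        indexes={},
        backend=hdf_backend,
    )
    assert list(vds.variables) == ["foo"]
```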

-    def test_open_root_group_manually(self, netcdf4_file_with_data_in_multiple_groups):
+    def test_open_root_group_manually(
+        self, netcdf4_file_with_data_in_multiple_groups, hdf_backend
+    ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, group="", indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            group="",
+            indexes={},
+            backend=hdf_backend,
         )
         assert list(vds.variables) == ["foo"]
         assert isinstance(vds["foo"].data, ManifestArray)
         assert vds["foo"].shape == (3,)

     def test_open_root_group_by_default(
-        self, netcdf4_file_with_data_in_multiple_groups
+        self, netcdf4_file_with_data_in_multiple_groups, hdf_backend
     ):
         vds = open_virtual_dataset(
-            netcdf4_file_with_data_in_multiple_groups, indexes={}
+            netcdf4_file_with_data_in_multiple_groups,
+            indexes={},
+            backend=hdf_backend,
         )
         assert list(vds.variables) == ["foo"]
         assert isinstance(vds["foo"].data, ManifestArray)
         assert vds["foo"].shape == (3,)

 @requires_kerchunk
 class TestLoadVirtualDataset:
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_loadable_variables(self, netcdf4_file, hdf_backend):
         vars_to_load = ["air", "time"]
         vds = open_virtual_dataset(
@@ -397,7 +405,7 @@ def test_explicit_filetype_and_backend(self, netcdf4_file):
             netcdf4_file, filetype="hdf", backend=HDFVirtualBackend
         )

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
         if hdf_backend == HDFVirtualBackend:
             with pytest.raises(NotImplementedError, match="Nested groups"):
@@ -408,7 +416,9 @@ def test_group_kwarg(self, hdf5_groups_file, hdf_backend):
                 )
         if hdf_backend == HDF5VirtualBackend:
             with pytest.raises(ValueError, match="not found in"):
-                open_virtual_dataset(hdf5_groups_file, group="doesnt_exist")
+                open_virtual_dataset(
+                    hdf5_groups_file, group="doesnt_exist", backend=hdf_backend
+                )

         vars_to_load = ["air", "time"]
         vds = open_virtual_dataset(
@@ -441,13 +451,13 @@ def test_open_virtual_dataset_passes_expected_args(
         }
         mock_read_kerchunk.assert_called_once_with(**args)

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_open_dataset_with_empty(self, hdf5_empty, hdf_backend):
         vds = open_virtual_dataset(hdf5_empty, backend=hdf_backend)
         assert vds.empty.dims == ()
         assert vds.empty.attrs == {"empty": "true"}

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_open_dataset_with_scalar(self, hdf5_scalar, hdf_backend):
         vds = open_virtual_dataset(hdf5_scalar, backend=hdf_backend)
         assert vds.scalar.dims == ()
20 changes: 7 additions & 13 deletions virtualizarr/tests/test_integration.py

[Review comment, Contributor Author]
These mostly still do not run in a kerchunk-free env. See #376 for that piece.

@@ -8,9 +8,7 @@

 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.readers import HDF5VirtualBackend
-from virtualizarr.readers.hdf import HDFVirtualBackend
-from virtualizarr.tests import requires_kerchunk
+from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk
 from virtualizarr.translators.kerchunk import (
     dataset_from_kerchunk_refs,
 )
@@ -61,7 +59,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
         ),
     ],
 )
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 def test_numpy_arrays_to_inlined_kerchunk_refs(
     netcdf4_file, inline_threshold, vars_to_inline, hdf_backend
 ):
@@ -89,7 +87,7 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
 @requires_kerchunk
 @pytest.mark.parametrize("format", ["dict", "json", "parquet"])
 class TestKerchunkRoundtrip:
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
         # set up example xarray dataset
         ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
@@ -122,7 +120,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
         for coord in ds.coords:
             assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
     def test_kerchunk_roundtrip_concat(
         self, tmpdir, format, hdf_backend, decode_times, time_vars
@@ -192,7 +190,7 @@ def test_kerchunk_roundtrip_concat(
         assert roundtrip.time.encoding["units"] == ds.time.encoding["units"]
         assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]

-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+    @parametrize_over_hdf_backends
     def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
         # regression test for GH issue #105

@@ -278,8 +276,7 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
     assert roundtrip.a.attrs == ds.a.attrs


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 def test_open_scalar_variable(tmpdir, hdf_backend):
     # regression test for GH issue #100

@@ -290,9 +287,8 @@ def test_open_scalar_variable(tmpdir, hdf_backend):
     assert vds["a"].shape == ()


+@parametrize_over_hdf_backends
 class TestPathsToURIs:
-    @requires_kerchunk
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
     def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)

@@ -302,8 +298,6 @@ def test_convert_absolute_paths_to_uris(self, netcdf4_file, hdf_backend):
         path = manifest["0.0.0"]["path"]
         assert path == expected_path

-    @requires_kerchunk
-    @pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
     def test_convert_relative_paths_to_uris(self, netcdf4_file, hdf_backend):
         relative_path = relpath(netcdf4_file)
         vds = open_virtual_dataset(relative_path, indexes={}, backend=hdf_backend)
16 changes: 6 additions & 10 deletions virtualizarr/tests/test_xarray.py

@@ -7,8 +7,7 @@

 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.readers import HDF5VirtualBackend, HDFVirtualBackend
-from virtualizarr.tests import requires_kerchunk
+from virtualizarr.tests import parametrize_over_hdf_backends
 from virtualizarr.zarr import ZArray

@@ -227,8 +226,7 @@ def test_concat_dim_coords_along_existing_dim(self):
     assert result.data.zarray.zarr_format == zarray.zarr_format


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [HDF5VirtualBackend, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestCombineUsingIndexes:
     def test_combine_by_coords(self, netcdf4_files_factory: Callable, hdf_backend):
         filepath1, filepath2 = netcdf4_files_factory()
@@ -262,8 +260,7 @@ def test_combine_by_coords_keeping_manifestarrays(self, netcdf4_files, hdf_backe
         assert isinstance(combined_vds["lon"].data, ManifestArray)


-@requires_kerchunk
-@pytest.mark.parametrize("hdf_backend", [None, HDFVirtualBackend])
+@parametrize_over_hdf_backends
 class TestRenamePaths:
     def test_rename_to_str(self, netcdf4_file, hdf_backend):
         vds = open_virtual_dataset(netcdf4_file, indexes={}, backend=hdf_backend)
@@ -313,14 +310,13 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(
         assert isinstance(renamed_vds["lat"].data, np.ndarray)


-@requires_kerchunk
 def test_nbytes(simple_netcdf4):
     vds = open_virtual_dataset(simple_netcdf4)
-    assert vds.virtualize.nbytes == 32
-    assert vds.nbytes == 48
+    assert vds.virtualize.nbytes == 88
+    assert vds.nbytes == 104

     vds = open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"])
-    assert vds.virtualize.nbytes == 48
+    assert vds.virtualize.nbytes == 104
[Review comment, Contributor Author]
I guess I am not concerned that the nbytes is different now?

[Review comment, Member]
That seems weird... I would have expected them to be the same.

[Review comment, Contributor Author]
I think this has to do with coordinates being populated for dimensions that are supposed to be "without coordinates"; see https://github.com/zarr-developers/VirtualiZarr/pull/395/files#r1934273183.
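A hedged sketch of the suspected mechanism (an interpretation of the thread above, not code from this PR): comparing both backends on the same file should expose the custom reader's extra coordinate bytes. The path is a hypothetical stand-in for the `simple_netcdf4` fixture, and the printed values follow the updated test expectations.

```python
from virtualizarr import open_virtual_dataset
from virtualizarr.readers import HDF5VirtualBackend  # kerchunk-based
from virtualizarr.readers.hdf import HDFVirtualBackend  # custom

path = "simple.nc"  # hypothetical stand-in for the simple_netcdf4 fixture

vds_kerchunk = open_virtual_dataset(path, backend=HDF5VirtualBackend)
vds_custom = open_virtual_dataset(path, backend=HDFVirtualBackend)

# If the custom reader materializes coordinates for otherwise bare
# dimensions (see #401), their bytes inflate both nbytes measures:
print(vds_kerchunk.virtualize.nbytes)  # previously expected: 32
print(vds_custom.virtualize.nbytes)    # now expected: 88
```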


     ds = open_dataset(simple_netcdf4)
     assert ds.virtualize.nbytes == ds.nbytes