Roundtrip kerchunk tests without requiring kerchunk #405

Closed · wants to merge 2 commits
14 changes: 14 additions & 0 deletions virtualizarr/tests/__init__.py
@@ -1,8 +1,10 @@
import importlib
import itertools

import fsspec
import numpy as np
import pytest
import xarray as xr
from packaging.version import Version

from virtualizarr.manifests import ChunkManifest, ManifestArray
@@ -105,3 +107,15 @@ def offset_from_chunk_key(ind: tuple[int, ...]) -> int:

def length_from_chunk_key(ind: tuple[int, ...]) -> int:
return sum(ind) + 5


def open_dataset_kerchunk(
filename_or_obj: str | dict, *, storage_options=None, **kwargs
) -> xr.Dataset:
"""Equivalent to ``xr.open_dataset(..., engine="kerchunk")`` but without depending
on the kerchunk library.
"""
m = fsspec.filesystem(
"reference", fo=filename_or_obj, **(storage_options or {})
).get_mapper()
return xr.open_dataset(m, engine="zarr", consolidated=False, **kwargs)
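
For orientation, a minimal usage sketch of this helper (`vds` here is an assumed pre-existing virtual dataset, not part of this diff; the last two lines spell out what the helper does internally):

# Sketch: open kerchunk references without importing kerchunk.
refs = vds.virtualize.to_kerchunk(format="dict")
ds = open_dataset_kerchunk(refs, decode_times=False)

# Equivalent path without the helper: fsspec's "reference" filesystem
# interprets the kerchunk refs mapping, and xarray reads it via its zarr engine.
m = fsspec.filesystem("reference", fo=refs).get_mapper()
ds = xr.open_dataset(m, engine="zarr", consolidated=False, decode_times=False)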
3 changes: 2 additions & 1 deletion virtualizarr/tests/test_backend.py
@@ -15,6 +15,7 @@
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import (
has_astropy,
open_dataset_kerchunk,
parametrize_over_hdf_backends,
requires_hdf5plugin,
requires_imagecodecs,
@@ -321,7 +322,7 @@ def test_virtualizarr_vs_local_nisar(self, hdf_backend):
)
tmpref = "/tmp/cmip6.json"
vds.virtualize.to_kerchunk(tmpref, format="json")
dsV = xr.open_dataset(tmpref, engine="kerchunk")
dsV = open_dataset_kerchunk(tmpref)

# xrt.assert_identical(dsXR, dsV) #Attribute order changes
xrt.assert_equal(dsXR, dsV)
38 changes: 22 additions & 16 deletions virtualizarr/tests/test_integration.py
@@ -8,7 +8,13 @@

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.tests import parametrize_over_hdf_backends, requires_kerchunk
from virtualizarr.tests import (
has_kerchunk,
open_dataset_kerchunk,
parametrize_over_hdf_backends,
requires_kerchunk,
requires_zarr_python,
)
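
For context, has_kerchunk, requires_kerchunk, and requires_zarr_python are optional-dependency flags and skip markers exported by virtualizarr.tests. Pairs like these are conventionally built with a small importorskip helper along the following lines (a sketch under that assumption, not the module's actual code):

import importlib

import pytest

def _importorskip(modname: str):
    # Probe for the optional dependency and return a boolean flag plus a
    # skipif marker, so tests can adjust parametrization or skip entirely.
    try:
        importlib.import_module(modname)
        has = True
    except ImportError:
        has = False
    return has, pytest.mark.skipif(not has, reason=f"requires {modname}")

has_kerchunk, requires_kerchunk = _importorskip("kerchunk")
has_zarr_python, requires_zarr_python = _importorskip("zarr")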
from virtualizarr.translators.kerchunk import (
dataset_from_kerchunk_refs,
)
@@ -84,8 +90,10 @@ def test_numpy_arrays_to_inlined_kerchunk_refs(
assert refs["refs"]["time/0"] == expected["refs"]["time/0"]


@requires_kerchunk
@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
@requires_zarr_python
@pytest.mark.parametrize(
"format", ["dict", "json", "parquet"] if has_kerchunk else ["dict", "json"]
)
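# Parquet references are only exercised when kerchunk is installed, since
# writing the parquet reference format still goes through kerchunk.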
class TestKerchunkRoundtrip:
@parametrize_over_hdf_backends
def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
@@ -103,14 +111,14 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format, hdf_backend):
ds_refs = vds.virtualize.to_kerchunk(format=format)

# use fsspec to read the dataset from the kerchunk references dict
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
roundtrip = open_dataset_kerchunk(ds_refs, decode_times=False)
else:
# write those references to disk as kerchunk references format
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)

# use fsspec to read the dataset from disk via the kerchunk references
roundtrip = xr.open_dataset(
f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
roundtrip = open_dataset_kerchunk(
f"{tmpdir}/refs.{format}", decode_times=False
)

# assert all_close to original dataset
@@ -164,16 +172,14 @@ def test_kerchunk_roundtrip_concat(
ds_refs = vds.virtualize.to_kerchunk(format=format)

# use fsspec to read the dataset from the kerchunk references dict
roundtrip = xr.open_dataset(
ds_refs, engine="kerchunk", decode_times=decode_times
)
roundtrip = open_dataset_kerchunk(ds_refs, decode_times=decode_times)
else:
# write those references to disk as kerchunk references format
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)

# use fsspec to read the dataset from disk via the kerchunk references
roundtrip = xr.open_dataset(
f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=decode_times
roundtrip = open_dataset_kerchunk(
f"{tmpdir}/refs.{format}", decode_times=decode_times
)

if decode_times is False:
@@ -214,14 +220,14 @@ def test_non_dimension_coordinates(self, tmpdir, format, hdf_backend):
ds_refs = vds.virtualize.to_kerchunk(format=format)

# use fsspec to read the dataset from the kerchunk references dict
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk", decode_times=False)
roundtrip = open_dataset_kerchunk(ds_refs, decode_times=False)
else:
# write those references to disk as kerchunk references format
vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)

# use fsspec to read the dataset from disk via the kerchunk references
roundtrip = xr.open_dataset(
f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
roundtrip = open_dataset_kerchunk(
f"{tmpdir}/refs.{format}", decode_times=False
)

# assert equal to original dataset
@@ -265,13 +271,13 @@ def test_datetime64_dtype_fill_value(self, tmpdir, format):
ds_refs = ds.virtualize.to_kerchunk(format=format)

# use fsspec to read the dataset from the kerchunk references dict
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk")
roundtrip = open_dataset_kerchunk(ds_refs)
else:
# write those references to disk as kerchunk references format
ds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)

# use fsspec to read the dataset from disk via the kerchunk references
roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")
roundtrip = open_dataset_kerchunk(f"{tmpdir}/refs.{format}")

assert roundtrip.a.attrs == ds.a.attrs

19 changes: 12 additions & 7 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
@@ -4,16 +4,21 @@

import virtualizarr
from virtualizarr.readers.hdf import HDFVirtualBackend
from virtualizarr.tests import requires_kerchunk
from virtualizarr.tests import (
open_dataset_kerchunk,
requires_hdf5plugin,
requires_imagecodecs,
)


@requires_kerchunk
@requires_hdf5plugin
@requires_imagecodecs
class TestIntegration:
@pytest.mark.xfail(
reason="0 time start is being interpreted as fillvalue see issues/280"
)
def test_filters_h5netcdf_roundtrip(
self, tmpdir, filter_encoded_roundtrip_hdf5_file, backend=HDFVirtualBackend
self, tmpdir, filter_encoded_roundtrip_hdf5_file
):
ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
vds = virtualizarr.open_virtual_dataset(
@@ -24,7 +29,7 @@ def test_filters_h5netcdf_roundtrip(
)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
roundtrip = open_dataset_kerchunk(kerchunk_file, decode_times=True)
xrt.assert_allclose(ds, roundtrip)

@pytest.mark.xfail(
@@ -37,8 +42,8 @@ def test_filters_netcdf4_roundtrip(
ds = xr.open_dataset(filepath)
vds = virtualizarr.open_virtual_dataset(filepath, backend=HDFVirtualBackend)
kerchunk_file = f"{tmpdir}/kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
vds.virtualize.to_kerchunk(kerchunk_file, format="dict")
roundtrip = open_dataset_kerchunk(kerchunk_file)
xrt.assert_equal(ds, roundtrip)

def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
@@ -48,5 +53,5 @@ def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file
)
kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
roundtrip = open_dataset_kerchunk(kerchunk_file)
xrt.assert_allclose(ds, roundtrip)