Skip to content

Commit 4d0721d

Browse files
authored
Merge pull request #923 from ayushnag/dmrpp_bugs_and_tests
Fix small open_virtual_dataset bugs
2 parents 91866ac + f39b151 commit 4d0721d

File tree

5 files changed

+62
-51
lines changed

5 files changed

+62
-51
lines changed

earthaccess/dmrpp_zarr.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@ def open_virtual_mfdataset(
9292
import xarray as xr
9393

9494
if access == "direct":
95-
fs = earthaccess.get_s3_filesystem(results=granules[0])
96-
fs.storage_options["anon"] = False # type: ignore
95+
fs = earthaccess.get_s3_filesystem(results=granules) # type: ignore
96+
fs.storage_options["anon"] = False
9797
else:
9898
fs = earthaccess.get_fsspec_https_session()
9999
if parallel:
@@ -114,7 +114,7 @@ def open_virtual_mfdataset(
114114
filetype="dmrpp", # type: ignore
115115
group=group,
116116
indexes={},
117-
reader_options={"storage_options": fs.storage_options}, # type: ignore
117+
reader_options={"storage_options": fs.storage_options},
118118
)
119119
)
120120
if preprocess is not None:
@@ -127,6 +127,7 @@ def open_virtual_mfdataset(
127127
vds = xr.combine_nested(vdatasets, **xr_combine_nested_kwargs)
128128
if load:
129129
refs = vds.virtualize.to_kerchunk(filepath=None, format="dict")
130+
protocol = "s3" if "s3" in fs.protocol else fs.protocol
130131
return xr.open_dataset(
131132
"reference://",
132133
engine="zarr",
@@ -135,8 +136,8 @@ def open_virtual_mfdataset(
135136
"consolidated": False,
136137
"storage_options": {
137138
"fo": refs, # codespell:ignore
138-
"remote_protocol": fs.protocol,
139-
"remote_options": fs.storage_options, # type: ignore
139+
"remote_protocol": protocol,
140+
"remote_options": fs.storage_options,
140141
},
141142
},
142143
)

earthaccess/kerchunk.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import fsspec.utils
77
import s3fs
88

9+
# import ipdb
910
import earthaccess
1011

1112

@@ -15,12 +16,19 @@ def _get_chunk_metadata(
1516
) -> list[dict]:
1617
from kerchunk.hdf import SingleHdf5ToZarr
1718

19+
if not isinstance(granule, earthaccess.DataGranule) and isinstance(granule, dict):
20+
# WHY: dask serialization is doing something weird: it serializes the granule as a simple dict,
21+
# we need to cast it back to a DataGranule to get the nice methods for parsing the data links
22+
# TODO: ask James what is going on
23+
granule = earthaccess.DataGranule(granule)
24+
1825
metadata = []
1926
access = "direct" if isinstance(fs, s3fs.S3FileSystem) else "indirect"
27+
# ipdb.set_trace()
2028

2129
for url in granule.data_links(access=access):
2230
with fs.open(url) as inf:
23-
h5chunks = SingleHdf5ToZarr(inf, url)
31+
h5chunks = SingleHdf5ToZarr(inf, url) # type: ignore
2432
m = h5chunks.translate()
2533
metadata.append(m)
2634

@@ -50,6 +58,8 @@ def consolidate_metadata(
5058

5159
# Get metadata for each granule
5260
get_chunk_metadata = dask.delayed(_get_chunk_metadata) # type: ignore
61+
62+
# ipdb.set_trace()
5363
chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granules]) # type: ignore
5464
chunks = sum(chunks, start=[])
5565

pyproject.toml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,19 @@ Changelog = "https://github.com/nsidc/earthaccess/blob/main/CHANGELOG.md"
5757

5858
[project.optional-dependencies]
5959
kerchunk = [
60-
"numpy >=1.26.4",
6160
"kerchunk",
6261
"dask",
6362
"h5py >=3.6.0",
6463
"h5netcdf",
6564
"xarray",
65+
"zarr >=2.12.0, <3.0.0a",
6666
]
6767
virtualizarr = [
68-
"virtualizarr >=1.2.0"
68+
"numpy >=1.26.4",
69+
"zarr >=2.12.0, <3.0.0a",
70+
"virtualizarr >=1.2.0",
71+
"dask",
72+
"h5py >=3.6.0",
6973
]
7074
dev = [
7175
"bump-my-version >=0.10.0",
@@ -75,6 +79,8 @@ dev = [
7579
"uv >=0.4.7",
7680
]
7781
test = [
82+
"zarr >=2.12.0, <3.0.0a",
83+
"numpy >=1.26.4",
7884
"mypy >=1.11.2",
7985
"pytest >=8.3",
8086
"pytest-cov >=5.0",

tests/integration/test_virtualizarr.py

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -15,41 +15,24 @@
1515
logger.info(f"earthaccess version: {earthaccess.__version__}")
1616

1717

18-
@pytest.fixture(scope="module", params=["MUR25-JPL-L4-GLOB-v04.2"])
18+
@pytest.fixture(
19+
scope="module",
20+
params=[
21+
"MUR25-JPL-L4-GLOB-v04.2",
22+
"AVHRR_OI-NCEI-L4-GLOB-v2.1",
23+
"M2T1NXSLV",
24+
],
25+
)
1926
def granule(request):
2027
granules = earthaccess.search_data(
2128
count=1, temporal=("2024"), short_name=request.param
2229
)
2330
return granules[0]
2431

2532

26-
def test_dmrpp(granule):
27-
from virtualizarr import open_virtual_dataset # type: ignore
28-
29-
fs = earthaccess.get_fsspec_https_session()
30-
data_path = granule.data_links(access="indirect")[0]
31-
dmrpp_path = data_path + ".dmrpp"
32-
33-
result = open_virtual_dataset(
34-
dmrpp_path,
35-
filetype="dmrpp", # type: ignore
36-
indexes={},
37-
reader_options={"storage_options": fs.storage_options}, # type: ignore
38-
)
39-
40-
expected = open_virtual_dataset(
41-
data_path,
42-
indexes={},
43-
reader_options={"storage_options": fs.storage_options}, # type: ignore
44-
)
45-
46-
# TODO: replace with xr.testing when virtualizarr fill_val is fixed (https://github.com/zarr-developers/VirtualiZarr/issues/287)
47-
# and dmrpp deflateLevel (zlib compression level) is always present (https://github.com/OPENDAP/bes/issues/954)
48-
for var in result.variables:
49-
assert var in expected.variables
50-
assert result[var].dims == expected[var].dims
51-
assert result[var].shape == expected[var].shape
52-
assert result[var].dtype == expected[var].dtype
53-
assert result[var].data.manifest == expected[var].data.manifest
54-
assert set(result.coords) == set(expected.coords)
55-
assert result.attrs == expected.attrs
33+
def test_open_virtual_dataset(granule):
34+
# Simply check that the dmrpp can be found, parsed, and loaded. Actual parser result is checked in virtualizarr
35+
vds = earthaccess.open_virtual_dataset(granule, load=False)
36+
assert vds is not None
37+
vds_load = earthaccess.open_virtual_dataset(granule, load=True)
38+
assert vds_load is not None

uv.lock

Lines changed: 23 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)