Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
run: uv python install

- name: Install the package
run: uv sync --all-extras --all-groups && uv pip install .
run: uv sync --refresh --all-extras --all-groups && uv pip install .

- name: Run tests
run: uv run pytest
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ To download all the data used for the benchmark run the following commands:
```bash
uv run python -m climatebenchpress.data_loader.datasets.esa_biomass_cci
uv run python -m climatebenchpress.data_loader.datasets.cams
uv run python -m climatebenchpress.data_loader.datasets.era5
uv run python -m climatebenchpress.data_loader.datasets.ifs_uncompressed
Comment thread
treigerm marked this conversation as resolved.
uv run python -m climatebenchpress.data_loader.datasets.nextgems
uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_ta
uv run python -m climatebenchpress.data_loader.datasets.cmip6.access_tos
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ dependencies = [
"cf-xarray~=0.10.0",
"cftime~=1.6.0",
"dask>=2024.12.0,<2025.4",
"earthkit-regrid~=0.5.0",
"fsspec>=2024.10.0,<2025.4",
"gribscan~=0.0.14",
"healpy~=1.18.0",
# These versions need to be pinned to be compatible with the NextGEMS
# catalog at https://data.nextgems-h2020.eu/online.yaml.
Expand Down Expand Up @@ -52,5 +54,5 @@ where = ["src"]
addopts = ["--import-mode=importlib"]

[[tool.mypy.overrides]]
module = ["fsspec.*", "intake.*", "healpy.*"]
module = ["fsspec.*", "intake.*", "healpy.*", "earthkit.*"]
follow_untyped_imports = true
1 change: 1 addition & 0 deletions src/climatebenchpress/data_loader/datasets/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
from .cmip6.all import *
from .era5 import *
from .esa_biomass_cci import *
from .ifs_uncompressed import *
from .nextgems import *
172 changes: 172 additions & 0 deletions src/climatebenchpress/data_loader/datasets/ifs_uncompressed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
__all__ = ["IFSUncompressedDataset"]

import argparse
from pathlib import Path

import earthkit.regrid
import numpy as np
import requests
import xarray as xr

from .. import (
monitor,
open_downloaded_canonicalized_dataset,
open_downloaded_tiny_canonicalized_dataset,
)
from .abc import Dataset

BASE_URL = "https://object-store.os-api.cci1.ecmwf.int/esiwacebucket"


class IFSUncompressedDataset(Dataset):
    """Dataset for IFS uncompressed data.

    Contains data from the [hplp](https://apps.ecmwf.int/ifs-experiments/rd/hplp/)
    experiment from the Integrated Forecasting System (IFS) model. Crucially,
    this dataset contains uncompressed 64-bit floating point data.
    """

    name = "ifs-uncompressed"

    @staticmethod
    def download(download_path: Path, progress: bool = True):
        """Fetch the surface-level hplp fields, regrid them to a regular
        0.25-degree lat-lon grid, and store the result as a Zarr store
        under ``download_path``."""
        # Mean sea-level pressure and the two 10 m wind components.
        raw = load_hplp_data(leveltype="sfc", gridtype="reduced_gg")[
            ["msl", "10u", "10v"]
        ]
        regridded = regrid_to_regular(
            raw,
            in_grid={"grid": "O400"},
            out_grid={"grid": [0.25, 0.25]},
        )
        target = download_path / "ifs_uncompressed.zarr"
        # Build the write lazily, then compute inside the progress-bar
        # context so the download/write is monitored.
        with monitor.progress_bar(progress):
            delayed_write = regridded.to_zarr(
                target, mode="w", encoding={}, compute=False
            )
            delayed_write.compute()

    @staticmethod
    def open(download_path: Path) -> xr.Dataset:
        """Open the previously downloaded Zarr store and attach the
        metadata needed to make the dataset CF-compliant."""
        ds = xr.open_dataset(download_path / "ifs_uncompressed.zarr")

        ds["longitude"].attrs["axis"] = "X"
        ds["latitude"].attrs["axis"] = "Y"
        ds["time"].attrs["standard_name"] = "time"
        return ds


def load_hplp_data(leveltype=None, gridtype=None, step=None, remap=False):
    """Open one of the remote IFS ``hplp`` experiment GRIB files as an xarray dataset.

    Function taken from:
    https://github.com/climet-eu/compression-lab-notebooks/blob/main/04-example-datasets/01-hplp.ipynb
    # TODO(review): replace the notebook link above with a commit permalink so
    # the reference stays valid if the notebook moves.

    Parameters
    ----------
    leveltype : str
        One of ``"pl"``, ``"ml"``, ``"sfc"``, ``"wave"``.
    gridtype : str, optional
        Grid the fields are stored on (e.g. ``"reduced_gg"``, ``"sh"``).
        Required for leveltype ``"pl"`` and ``"ml"``.
    step : int, optional
        Forecast step in hours. Only meaningful for ``leveltype="ml"`` with
        ``gridtype="reduced_gg"``, where it selects one of two file parts.
    remap : bool
        Load the O400-remapped variant; only valid for ``gridtype="sh"``.

    Returns
    -------
    xr.Dataset
        Lazily opened via the kerchunk reference sidecar (``<url>.ref``).

    Raises
    ------
    ValueError
        For invalid leveltype/gridtype/remap/step combinations.
    """
    if leveltype not in {"pl", "ml", "sfc", "wave"}:
        raise ValueError(
            f"Invalid leveltype: '{leveltype}'. Available leveltypes: pl, ml, sfc, wave"
        )

    if leveltype in {"ml", "pl"} and not gridtype:
        raise ValueError(
            f"Gridtype is required for leveltype '{leveltype}'. Available: reduced_gg, sh"
        )

    if remap and gridtype != "sh":
        raise ValueError("Only 'sh' fields can be remapped.")

    if leveltype == "wave" and gridtype != "reduced_ll":
        print("Warning: Wave model data are stored on a reduced_ll grid.")

    if leveltype == "sfc" and gridtype != "reduced_gg":
        print("Warning: Surface level data are stored on a reduced_gg grid.")

    # Compare against None (not truthiness) so that step=0 also triggers the
    # warning when it would be ignored.
    if step is not None and not (leveltype == "ml" and gridtype == "reduced_gg"):
        print(
            "Warning: Specifying 'step' is unnecessary for this configuration and will be ignored."
        )

    if leveltype in {"sfc", "wave"}:
        url = f"{BASE_URL}/hplp/hplp_{leveltype}.grib"
    elif leveltype == "ml" and gridtype == "reduced_gg":
        if step is None:
            raise ValueError(
                "The ml reduced_gg data are split into two parts:\n"
                " - Steps: 0, 12, 24, 36, 48, 60, 72, 84, 96, 108, 120 (2020-07-21T00:00:00 to 2020-07-26T00:00:00)\n"
                " - Steps: 132, 144, 156, 168, 180, 192, 204, 216, 228, 240 (2020-07-26T12:00:00 to 2020-07-31T00:00:00)\n"
                "Specify a step smaller than 120 for accessing the first part, \n"
                "and a step greater or equal to 132 for accessing the second part."
            )
        if step <= 120:
            url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}_levels_0_120.grib"
        else:
            url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}_levels_132_240.grib"
    else:
        url = f"{BASE_URL}/hplp/hplp_{leveltype}_{gridtype}" + (
            "_O400.grib" if remap else ".grib"
        )
    # The `.ref` sidecar holds the kerchunk reference mapping for the GRIB file.
    ref = requests.get(f"{url}.ref").json()

    print(f"Loading dataset {url}")

    return xr.open_dataset(
        "reference://",
        engine="zarr",
        backend_kwargs=dict(storage_options=dict(fo=ref, asynchronous=False)),
        consolidated=False,
    )


def regrid_to_regular(ds, in_grid, out_grid):
    """Regrid dataset to a regular lat-lon grid.

    Each variable is interpolated one timestep at a time with
    ``earthkit.regrid.interpolate``, then the results are reassembled into a
    dataset with ``(time, latitude, longitude)`` dimensions.

    Parameters
    ----------
    ds : xr.Dataset
        The input dataset to regrid.
    in_grid : dict
        The input grid specification for earthkit.regrid.interpolate.
    out_grid : dict
        The output grid specification for earthkit.regrid.interpolate. Is assumed to be
        a regular lat-lon grid with equal spacing in latitude and longitude, e.g. {"grid": [0.25, 0.25]}.

    Returns
    -------
    xr.Dataset
        Dataset on the regular grid, latitudes running 90 to -90 and
        longitudes 0 to 360 - dx.
    """
    # Validate the output grid *before* the expensive interpolation loop so
    # a bad specification fails fast instead of after minutes of work.
    dx = out_grid["grid"][0]
    assert (
        out_grid["grid"][0] == out_grid["grid"][1]
    ), "Only grids with equal latitude and longitude spacing are supported."

    out_data = {var: [] for var in ds.data_vars}
    for var in ds.data_vars:
        for time in ds.time:
            r = earthkit.regrid.interpolate(
                ds[var].sel(time=time).values,
                in_grid=in_grid,
                out_grid=out_grid,
                method="linear",
            )
            out_data[var].append(r)

    # Regular grid: latitudes include both poles, longitudes wrap (no 360).
    lats = np.linspace(90, -90, int(180 / dx) + 1)
    lons = np.linspace(0, 360 - dx, int(360 / dx))
    coords = {
        "time": ds.time,
        "latitude": lats,
        "longitude": lons,
    }
    out_ds = xr.Dataset(
        {
            var: (("time", "latitude", "longitude"), out_data[var])
            for var in ds.data_vars
        },
        coords=coords,
    )
    return out_ds


if __name__ == "__main__":
    # CLI entry point: download (if needed) and canonicalize the dataset,
    # then print a summary of its variables.
    parser = argparse.ArgumentParser()
    parser.add_argument("--basepath", type=Path, default=Path())
    cli_args = parser.parse_args()

    ds = open_downloaded_canonicalized_dataset(
        IFSUncompressedDataset, basepath=cli_args.basepath
    )
    # Also materialize the tiny variant used for quick tests.
    open_downloaded_tiny_canonicalized_dataset(
        IFSUncompressedDataset, basepath=cli_args.basepath
    )

    for v, da in ds.items():
        print(f"- {v}: {da.dims}")