Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions tests/test_chunking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""
Unit tests for ops.chunking module.
"""
import numpy as np
import pytest
import xarray as xr

from mllam_data_prep.ops.chunking import check_chunk_size, chunk_dataset


@pytest.fixture
def small_dataset():
    """Return a small 10x10 two-variable dataset for chunking tests."""
    variables = {
        name: (["x", "y"], np.random.random((10, 10)))
        for name in ("var1", "var2")
    }
    return xr.Dataset(variables, coords={"x": range(10), "y": range(10)})


@pytest.fixture
def large_dataset():
    """Create a dataset whose dims permit chunk specs past the warning limit.

    The chunk-size check derives memory usage from the requested chunk sizes
    and the variable dtype, so the backing data never needs real values.
    ``np.broadcast_to`` yields a zero-copy, read-only view of a single
    scalar, keeping this fixture cheap — materialising a 5000x5000 float64
    array with ``np.random.random`` would allocate ~200 MB per test.
    """
    size = 5000
    # Same shape and dtype as before, but no 200 MB allocation.
    data = np.broadcast_to(np.float64(0.0), (size, size))
    return xr.Dataset(
        {"large_var": (["x", "y"], data)},
        coords={"x": range(size), "y": range(size)},
    )
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

large_dataset allocates a 5000x5000 float64 array (~200MB) during test collection/execution, which is likely to slow down or OOM CI. You can trigger the chunk-size warning without a huge dataset (the implementation only uses chunks and dtype), so this fixture should be removed or made tiny.

Copilot uses AI. Check for mistakes.


def test_check_chunk_size_small_chunks(small_dataset, caplog):
    """check_chunk_size must stay silent for modest chunk sizes."""
    check_chunk_size(small_dataset, {"x": 5, "y": 5})
    # No warnings (or any other log records) should have been emitted.
    assert not caplog.records


def test_check_chunk_size_large_chunks(large_dataset, caplog):
    """Test check_chunk_size with large chunks (should warn).

    Two fixes over the naive version:
    - The requested chunk product must exceed the 1 GB threshold:
      20000 * 20000 elements * 8 bytes (float64) ~= 3.2 GB, whereas
      1000 * 1000 * 8 bytes is only ~8 MB and never triggers the warning.
    - check_chunk_size logs through loguru, which pytest's ``caplog`` does
      not capture by default, so warnings are collected via an explicit
      loguru sink instead.
    """
    # Local import: loguru is a runtime dependency of the package under test.
    from loguru import logger

    chunks = {"x": 20000, "y": 20000}

    messages = []
    sink_id = logger.add(messages.append, level="WARNING")
    try:
        check_chunk_size(large_dataset, chunks)
    finally:
        # Always detach the sink so other tests are unaffected.
        logger.remove(sink_id)

    assert len(messages) > 0
    assert "exceeds" in messages[0].lower()
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This warning test is currently inconsistent with the implementation:

  • check_chunk_size computes memory_usage from the requested chunk sizes (product of values in chunks) and dtype, so {"x": 1000, "y": 1000} is only ~8MB and should not exceed the 1GB threshold.
  • check_chunk_size logs via loguru.logger, which caplog does not capture by default, so caplog.records will remain empty.
    Adjust the test to (1) use chunk sizes whose product exceeds the threshold and (2) capture Loguru output using a Loguru sink (or a pytest plugin that bridges Loguru to stdlib logging).

Copilot uses AI. Check for mistakes.


def test_check_chunk_size_missing_dimension(small_dataset):
    """A chunk entry for an absent dimension is ignored, not an error."""
    # "z" is not a dimension of any variable; the call must still succeed.
    check_chunk_size(small_dataset, {"x": 5, "z": 10})


def test_chunk_dataset_success(small_dataset):
    """chunk_dataset returns a dataset with chunking actually applied."""
    result = chunk_dataset(small_dataset, {"x": 5, "y": 5})
    assert isinstance(result, xr.Dataset)
    # A non-None .chunks attribute means the variable is now lazily chunked.
    assert result["var1"].chunks is not None


def test_chunk_dataset_invalid_chunks(small_dataset):
    """A negative chunk size must surface as a chunking error."""
    bad_chunks = {"x": -1}
    with pytest.raises(Exception, match="Error chunking dataset"):
        chunk_dataset(small_dataset, bad_chunks)
47 changes: 47 additions & 0 deletions tests/test_loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Unit tests for ops.loading module.
"""
import tempfile
from pathlib import Path

Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tempfile and Path are imported but never used in this test module; please remove them to keep the test suite clean (and avoid failing linting if enabled).

Suggested change
import tempfile
from pathlib import Path

Copilot uses AI. Check for mistakes.
import pytest
import xarray as xr

from mllam_data_prep.ops.loading import load_input_dataset


@pytest.fixture
def sample_dataset():
    """Return a minimal one-variable dataset for file round-trip tests."""
    values = [1, 2, 3]
    return xr.Dataset({"var": (["x"], values)}, coords={"x": [0, 1, 2]})


def test_load_input_dataset_zarr(sample_dataset, tmp_path):
    """Round-trip the sample dataset through a zarr store."""
    store = tmp_path / "test.zarr"
    sample_dataset.to_zarr(store, mode="w")

    result = load_input_dataset(str(store))
    assert isinstance(result, xr.Dataset)
    assert "var" in result.data_vars
    assert result.x.values.tolist() == [0, 1, 2]


def test_load_input_dataset_netcdf(sample_dataset, tmp_path):
    """Test load_input_dataset with netCDF format.

    xarray's ``to_netcdf`` needs an optional backend engine (scipy,
    netCDF4, or h5netcdf) that is not a core project dependency, so the
    test skips cleanly when no engine is available instead of failing
    with a cryptic backend error, and pins the engine explicitly for
    reproducibility across environments.
    """
    pytest.importorskip("scipy")

    nc_path = tmp_path / "test.nc"
    # scipy writes netCDF3, which is sufficient for this simple dataset.
    sample_dataset.to_netcdf(nc_path, engine="scipy")

    loaded = load_input_dataset(str(nc_path))
    assert isinstance(loaded, xr.Dataset)
    assert "var" in loaded.data_vars
    assert list(loaded.x.values) == [0, 1, 2]


def test_load_input_dataset_nonexistent():
    """Loading a path that does not exist raises an OS-level error."""
    missing_path = "/nonexistent/path/to/file.zarr"
    with pytest.raises((OSError, FileNotFoundError)):
        load_input_dataset(missing_path)
84 changes: 84 additions & 0 deletions tests/test_selection_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""
Unit tests for helper functions in ops.selection module.
"""
import datetime
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

datetime is imported but unused in this test file; please remove it to avoid unused-import warnings / lint failures.

Suggested change
import datetime

Copilot uses AI. Check for mistakes.

import pandas as pd
import pytest
import xarray as xr

from mllam_data_prep.ops.selection import check_point_in_dataset, check_step


@pytest.fixture
def simple_time_dataset():
    """Create a dataset with five 3-hourly time steps from 2020-01-01."""
    # Lowercase "h" alias: uppercase "H" is deprecated since pandas 2.2
    # and emits a FutureWarning.
    time_values = pd.date_range("2020-01-01", periods=5, freq="3h")
    return xr.Dataset(
        {"var": (["time"], range(5))},
        coords={"time": time_values},
    )


def test_check_point_in_dataset_point_exists(simple_time_dataset):
    """A point taken from the coordinate itself passes the check."""
    existing_point = simple_time_dataset.time.values[2]
    # Must not raise for a value that is present in the coordinate.
    check_point_in_dataset("time", existing_point, simple_time_dataset)


def test_check_point_in_dataset_point_not_exists(simple_time_dataset):
    """A timestamp absent from the coordinate is rejected."""
    missing_point = pd.Timestamp("2020-01-02T12:00")
    with pytest.raises(ValueError, match="Provided value for coordinate time"):
        check_point_in_dataset("time", missing_point, simple_time_dataset)


def test_check_point_in_dataset_none_point(simple_time_dataset):
    """Passing None as the point is a no-op and must not raise."""
    check_point_in_dataset("time", None, simple_time_dataset)


def test_check_step_constant_step_matches(simple_time_dataset):
    """A constant 3-hour coordinate step matching the request passes."""
    # The fixture's time axis is 3-hourly, so this must not raise.
    check_step(pd.Timedelta(hours=3), "time", simple_time_dataset)


def test_check_step_constant_step_mismatch(simple_time_dataset):
    """Requesting 6h against a 3h coordinate raises a step-size error."""
    mismatched_step = pd.Timedelta(hours=6)
    with pytest.raises(ValueError, match="Step size for coordinate time"):
        check_step(mismatched_step, "time", simple_time_dataset)


def test_check_step_non_constant_step():
    """check_step rejects a coordinate with irregular spacing."""
    # Gaps are 3h, 7h, 3h — deliberately not constant.
    irregular_times = pd.to_datetime(
        [
            "2020-01-01T00:00",
            "2020-01-01T03:00",
            "2020-01-01T10:00",
            "2020-01-01T13:00",
        ]
    )
    ds = xr.Dataset(
        {"var": (["time"], range(4))},
        coords={"time": irregular_times},
    )
    with pytest.raises(
        ValueError, match="Step size for coordinate time is not constant"
    ):
        check_step(pd.Timedelta(hours=3), "time", ds)


def test_check_step_single_point_coordinate():
    """A single-point coordinate currently raises IndexError in check_step.

    With only one time value the diff of the coordinate is empty, so
    check_step fails when indexing the first step. This pins the current
    behavior; raising a descriptive ValueError for coordinates with fewer
    than two points would be a better contract, but changing that belongs
    in check_step itself, not here.
    """
    # Lowercase "h": uppercase "H" frequency alias is deprecated in pandas 2.2+.
    time_values = pd.date_range("2020-01-01", periods=1, freq="3h")
    ds = xr.Dataset(
        {"var": (["time"], [1])},
        coords={"time": time_values},
    )
    with pytest.raises(IndexError):
        check_step(pd.Timedelta(hours=3), "time", ds)
Comment on lines +70 to +80
Copy link

Copilot AI Feb 20, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test locks in an IndexError for the single-point coordinate case, which is an implementation accident (coming from indexing all_steps[0] on an empty diff array). It would be more maintainable to update check_step to raise a descriptive ValueError when ds[coord] has fewer than 2 points, and assert that error type/message here instead.

Copilot uses AI. Check for mistakes.
Loading