mllam · matschreiner · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025
diff --git a/.github/workflows/python-package-pip.yml b/.github/workflows/python-package-pip.yml
@@ -25,10 +25,11 @@ jobs:
         run: |
           python -m pip install . "zarr${{ matrix.zarr-version }}"
           python -m pip install pytest
+          python -m pip install pytest-coverage
 
       - name: Run tests
         env:
           AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
           AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         run: |
-          python -m pytest tests/
+          python -m pytest tests/ --cov mllam_data_prep --cov-fail-under=100 --cov-report term-missing
diff --git a/mllam_data_prep/__init__.py b/mllam_data_prep/__init__.py
@@ -1,8 +1,10 @@
+# pragma: no cover
+
 import importlib.metadata
 
-try:
+try:  # pragma: no cover
     __version__ = importlib.metadata.version(__name__)
-except importlib.metadata.PackageNotFoundError:
+except importlib.metadata.PackageNotFoundError:  # pragma: no cover
     __version__ = "unknown"
 
 # expose the public API

diff --git a/mllam_data_prep/__main__.py b/mllam_data_prep/__main__.py
@@ -1,20 +1,20 @@
-import os
-from pathlib import Path
+import os  # pragma: no cover
+from pathlib import Path  # pragma: no cover
 
-from loguru import logger
+from loguru import logger  # pragma: no cover
 
-from .create_dataset import create_dataset_zarr
+from .create_dataset import create_dataset_zarr  # pragma: no cover
 
 # Attempt to import psutil and dask.distributed modules
-DASK_DISTRIBUTED_AVAILABLE = True
-try:
+DASK_DISTRIBUTED_AVAILABLE = True  # pragma: no cover
+try:  # pragma: no cover
     import psutil
     from dask.diagnostics import ProgressBar
     from dask.distributed import LocalCluster
-except (ImportError, ModuleNotFoundError):
+except (ImportError, ModuleNotFoundError):  # pragma: no cover
     DASK_DISTRIBUTED_AVAILABLE = False
 
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     import argparse
 
     parser = argparse.ArgumentParser(

diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py
@@ -26,7 +26,7 @@ def validate_config(config_inputs):
 
     for input_dataset_name, input_dataset in config_inputs.items():
         if not input_dataset.variables and not input_dataset.derived_variables:
-            raise InvalidConfigException(
+            raise InvalidConfigException(  # pragma: no cover
                 f"Input dataset '{input_dataset_name}' is missing the keys `variables` and/or"
                 " `derived_variables`. Make sure that you update the config so that the input"
                 f" dataset '{input_dataset_name}' contains at least either a `variables` or"
@@ -36,16 +36,16 @@ def validate_config(config_inputs):
             # Check so that there are no overlapping variables
             if isinstance(input_dataset.variables, list):
                 variable_vars = input_dataset.variables
-            elif isinstance(input_dataset.variables, dict):
+            elif isinstance(input_dataset.variables, dict):  # pragma: no cover
                 variable_vars = input_dataset.variables.keys()
             else:
-                raise TypeError(
+                raise TypeError(  # pragma: no cover
                     f"Expected an instance of list or dict, but got {type(input_dataset.variables)}."
                 )
             derived_variable_vars = input_dataset.derived_variables.keys()
             common_vars = list(set(variable_vars) & set(derived_variable_vars))
             if len(common_vars) > 0:
-                raise InvalidConfigException(
+                raise InvalidConfigException(  # pragma: no cover
                     "Both `variables` and `derived_variables` include the following variables name(s):"
                     f" '{', '.join(common_vars)}'. This is not allowed. Make sure that there"
                     " are no overlapping variable names between `variables` and `derived_variables`,"
@@ -382,7 +382,7 @@ class _(JSONWizard.Meta):
         raise_on_unknown_json_key = True
 
 
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     import argparse
 
     argparser = argparse.ArgumentParser()

diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
@@ -22,10 +22,10 @@
 from .ops.statistics import calc_stats
 from .ops.subsetting import extract_variable
 
-if Version(zarr.__version__) >= Version("3"):
+if Version(zarr.__version__) >= Version("3"):  # pragma: no cover
     from zarr.codecs import BloscCodec, BloscShuffle
 else:
-    from numcodecs import Blosc
+    from numcodecs import Blosc  # pragma: no cover
 
 # The config versions defined in SUPPORTED_CONFIG_VERSIONS are the ones currently supported.
 # The `extra` field in the config that was added between v0.2.0 and v0.5.0 is optional, and
@@ -38,7 +38,7 @@ def _check_dataset_attributes(ds, expected_attributes, dataset_name):
     # check that the dataset has the expected attributes with the expected values
     missing_attributes = set(expected_attributes.keys()) - set(ds.attrs.keys())
     if len(missing_attributes) > 0:
-        raise ValueError(
+        raise ValueError(  # pragma: no cover
             f"Dataset {dataset_name} is missing the following attributes: {missing_attributes}"
         )
 
@@ -47,13 +47,13 @@ def _check_dataset_attributes(ds, expected_attributes, dataset_name):
         key: val for key, val in expected_attributes.items() if ds.attrs[key] != val
     }
     if len(incorrect_attributes) > 0:
-        s_list = "\n".join(
+        s_list = "\n".join(  # pragma: no cover
             [
                 f"{key}: {val} != {ds.attrs[key]}"
                 for key, val in incorrect_attributes.items()
             ]
         )
-        raise ValueError(
+        raise ValueError(  # pragma: no cover
             f"Dataset {dataset_name} has the following incorrect attributes: {s_list}"
         )
 
@@ -67,11 +67,11 @@ def _merge_dataarrays_by_target(dataarrays_by_target):
         for da in das:
             d = da.attrs.get("variables_mapping_dim", None)
             if d is None:
-                raise ValueError(
+                raise ValueError(  # pragma: no cover
                     f"Dataarray for target {target} does not have the 'variables_mapping_dim' attribute"
                 )
             if concat_dim is not None and d != concat_dim:
-                raise ValueError(
+                raise ValueError(  # pragma: no cover
                     f"Dataarrays for target {target} have different 'variables_mapping_dim' attributes: {d} != {concat_dim}"
                 )
             concat_dim = d
@@ -104,7 +104,7 @@ def _merge_dataarrays_by_target(dataarrays_by_target):
                 " Maybe you need to give the 'feature' dimension a unique name for each target variable?"
             ) from ex
         else:
-            raise ex
+            raise ex  # pragma: no cover
     return ds
 
 
@@ -124,13 +124,13 @@ def create_dataset(config: Config):
         as defined in the config file.
     """
     if not config.schema_version in SUPPORTED_CONFIG_VERSIONS:
-        raise ValueError(
+        raise ValueError(  # pragma: no cover
             f"Unsupported schema version {config.schema_version}. Only schema versions "
             f" {', '.join(SUPPORTED_CONFIG_VERSIONS)} are supported by mllam-data-prep "
             f"v{__version__}."
         )
     if config.schema_version == "v0.2.0" and config.extra:
-        raise ValueError(
+        raise ValueError(  # pragma: no cover
             "Config schema version v0.2.0 does not support the `extra` field. Please "
             "update the schema version used in your config to v0.5.0."
         )
@@ -154,7 +154,7 @@ def create_dataset(config: Config):
         logger.info(f"Loading dataset {dataset_name} from {path}")
         try:
             ds_input = load_input_dataset(fp=path)
-        except Exception as ex:
+        except Exception as ex:  # pragma: no cover
             raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex
 
         if input_config.coord_ranges is not None:
@@ -177,7 +177,7 @@ def create_dataset(config: Config):
                 for var_name in selected_variables:
                     ds[var_name] = extract_variable(ds=ds_input, var_name=var_name)
             else:
-                raise ValueError(
+                raise ValueError(  # pragma: no cover
                     "The `variables` argument should be a list or a dictionary"
                 )
 
@@ -204,7 +204,7 @@ def create_dataset(config: Config):
         # final dataset
         missing_dims = set(output_dims) - set(dim_mapping.keys())
         if missing_dims:
-            raise ValueError(
+            raise ValueError(  # pragma: no cover
                 f"Missing dimension mapping for {missing_dims}"
                 f" for input dataset {dataset_name}, please provide"
                 " a mapping for all output dimensions by"
@@ -220,7 +220,7 @@ def create_dataset(config: Config):
                 dim_mapping=dim_mapping,
                 expected_input_var_dims=expected_input_var_dims,
             )
-        except Exception as ex:
+        except Exception as ex:  # pragma: no cover
             raise Exception(
                 f"There was an issue stacking dimensions and variables to"
                 f" produce variable {target_output_var} from dataset {dataset_name}"
@@ -323,10 +323,10 @@ def create_dataset_zarr(fp_config: Path, fp_zarr: Optional[str | Path] = None):
 
     # use zstd compression since it has a good balance of speed and compression ratio
     # https://engineering.fb.com/2016/08/31/core-infra/smaller-and-faster-data-compression-with-zstandard/
-    if Version(zarr.__version__) >= Version("3"):
+    if Version(zarr.__version__) >= Version("3"):  # pragma: no cover
         compressor = BloscCodec(cname="zstd", clevel=3, shuffle=BloscShuffle.bitshuffle)
         encoding = {v: {"compressors": compressor} for v in ds.data_vars}
-    else:
+    else:  # pragma: no cover
         compressor = Blosc(cname="zstd", clevel=1, shuffle=Blosc.BITSHUFFLE)
         encoding = {v: {"compressor": compressor} for v in ds.data_vars}
 

diff --git a/mllam_data_prep/ops/chunking.py b/mllam_data_prep/ops/chunking.py
@@ -38,7 +38,7 @@ def check_chunk_size(ds, chunks):
 
         memory_usage = total_chunk_size * bytes_per_element
 
-        if memory_usage > CHUNK_MAX_SIZE_WARNING:
+        if memory_usage > CHUNK_MAX_SIZE_WARNING:  # pragma: no cover
             logger.warning(
                 f"The chunk size for '{var_name}' exceeds '{CHUNK_MAX_SIZE_WARNING / 1024**3}' GB."
             )
@@ -67,7 +67,7 @@ def chunk_dataset(ds, chunks):
     # Try chunking
     try:
         ds = ds.chunk(chunks)
-    except Exception as ex:
+    except Exception as ex:  # pragma: no cover
         raise Exception(f"Error chunking dataset: {ex}")
 
     return ds
diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py
@@ -111,7 +111,7 @@ def derive_variable(ds, derived_variable, chunking, target_dims):
 
         # Align the derived field to the output dataset dimensions (if necessary)
         derived_field = _align_derived_variable(derived_field, ds, target_dims)
-    else:
+    else:  # pragma: no cover
         raise TypeError(
             f"Expected an instance of xr.DataArray, but got {type(derived_field)}."
         )

diff --git a/mllam_data_prep/ops/derive_variable/physical_field.py b/mllam_data_prep/ops/derive_variable/physical_field.py
@@ -5,6 +5,7 @@
 of time and lat/lon location), but also of other physical fields, such as
 wind speed, which is a function of both meridional and zonal wind components.
 """
+
 import datetime
 
 import numpy as np
@@ -39,7 +40,7 @@ def calculate_toa_radiation(lat, lon, time):
     if isinstance(time, xr.DataArray):
         day = time.dt.dayofyear
         hour_utc = time.dt.hour
-    elif isinstance(time, datetime.datetime):
+    elif isinstance(time, datetime.datetime):  # pragma: no cover
         day = time.timetuple().tm_yday
         hour_utc = time.hour
     else:

diff --git a/mllam_data_prep/ops/derive_variable/time_components.py b/mllam_data_prep/ops/derive_variable/time_components.py
@@ -2,6 +2,7 @@
 Contains functions used to derive time component fields, such as e.g. day of year
 and hour of day.
 """
+
 import datetime
 
 import numpy as np
@@ -31,7 +32,7 @@ def calculate_hour_of_day(time, component):
     # Get the hour of the day
     if isinstance(time, xr.DataArray):
         hour_of_day = time.dt.hour
-    elif isinstance(time, datetime.datetime):
+    elif isinstance(time, datetime.datetime):  # pragma: no cover
         hour_of_day = time.hour
     else:
         raise TypeError(
@@ -44,7 +45,7 @@ def calculate_hour_of_day(time, component):
         hour_of_day_encoded = np.sin((hour_of_day / 24) * 2 * np.pi)
     elif component == "cos":
         hour_of_day_encoded = np.cos((hour_of_day / 24) * 2 * np.pi)
-    else:
+    else:  # pragma: no cover
         raise ValueError(
             f"Invalid value of `component`: '{component}'. Expected one of: 'cos' or 'sin'."
             " Please update the config accordingly."
@@ -81,9 +82,9 @@ def calculate_day_of_year(time, component):
     logger.info("Calculating day of year")
 
     # Get the day of year
-    if isinstance(time, xr.DataArray):
+    if isinstance(time, xr.DataArray):  # pragma: no cover
         day_of_year = time.dt.dayofyear
-    elif isinstance(time, datetime.datetime):
+    elif isinstance(time, datetime.datetime):  # pragma: no cover
         day_of_year = time.timetuple().tm_yday
     else:
         raise TypeError(
@@ -92,22 +93,22 @@ def calculate_day_of_year(time, component):
         )
 
     # Cyclic encoding of day of year - use 366 to include leap years!
-    if component == "sin":
+    if component == "sin":  # pragma: no cover
         day_of_year_encoded = np.sin((day_of_year / 366) * 2 * np.pi)
-    elif component == "cos":
+    elif component == "cos":  # pragma: no cover
         day_of_year_encoded = np.cos((day_of_year / 366) * 2 * np.pi)
-    else:
+    else:  # pragma: no cover
         raise ValueError(
             f"Invalid value of `component`: '{component}'. Expected one of: 'cos' or 'sin'."
             " Please update the config accordingly."
         )
 
-    if isinstance(day_of_year_encoded, xr.DataArray):
+    if isinstance(day_of_year_encoded, xr.DataArray):  # pragma: no cover
         # Add attributes
         day_of_year_encoded.name = "day_of_year_" + component
         day_of_year_encoded.attrs[
             "long_name"
         ] = f"{component.capitalize()} component of cyclically encoded day of year"
         day_of_year_encoded.attrs["units"] = "1"
 
-    return day_of_year_encoded
+    return day_of_year_encoded  # pragma: no cover
diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py
@@ -19,7 +19,7 @@ def load_input_dataset(fp):
 
     try:
         ds = xr.open_zarr(fp)
-    except ValueError:
+    except ValueError:  # pragma: no cover
         ds = xr.open_dataset(fp)
 
     return ds
diff --git a/mllam_data_prep/ops/mapping.py b/mllam_data_prep/ops/mapping.py
@@ -2,7 +2,7 @@
 
 
 def _check_for_malformed_list_arg(s):
-    if isinstance(s, str) and "," in s:
+    if isinstance(s, str) and "," in s:  # pragma: no cover
         raise Exception(
             "Rather than writing `{s}` to define a list you would `[{s}]` in the config file."
         )
@@ -65,21 +65,23 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims):
     for arch_dim in list(dim_mapping.keys()):
         if dim_mapping[arch_dim].method == "stack_variables_by_var_name":
             variable_dim_mappings[arch_dim] = dim_mapping.pop(arch_dim)
-    if len(variable_dim_mappings) > 1:
+    if len(variable_dim_mappings) > 1:  # pragma: no cover
         raise ValueError(
             "Only one mapping which requires stacking variables"
             " into a single dataarray is allowed, found ones targeting"
             f" the following arch dimensions: {list(variable_dim_mappings.keys())}"
         )
-    elif len(variable_dim_mappings) == 0:
+    elif len(variable_dim_mappings) == 0:  # pragma: no cover
         raise Exception(
             "At least one mapping should be defined for stacking variables, i.e. uses"
             f" the method `stack_variables_by_var_name`. Current mapping is: {dim_mapping}"
         )
 
     # check that none of the variables have dims that are not in the expected_input_var_dims
     for var_name in ds.data_vars:
-        if not set(ds[var_name].dims).issubset(expected_input_var_dims):
+        if not set(ds[var_name].dims).issubset(
+            expected_input_var_dims
+        ):  # pragma: no cover
             extra_dims = set(ds[var_name].dims) - set(expected_input_var_dims)
             raise ValueError(
                 f"The variable {var_name} has dimensions {ds[var_name].dims} however the"
@@ -101,7 +103,7 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims):
             # dimension, this is for example used for flatting the spatial dimensions
             # into a single dimension representing the grid index
             ds = ds.stack({arch_dim: source_dims}).reset_index(arch_dim)
-        else:
+        else:  # pragma: no cover
             raise NotImplementedError(method)
 
     # Finally, we handle the stacking of variables to coordinate values. We
@@ -125,14 +127,14 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims):
                 name_format=name_format,
                 combined_dim_name=arch_dim,
             )
-        else:
+        else:  # pragma: no cover
             # TODO: this will have to involved xrarrays MultiIndex, but lets leave
             # this until we need it
             raise NotImplementedError(len(dims))
         # set a flag we can use later to identify which coordinate the variables
         # were mapped into
         da.attrs["variables_mapping_dim"] = arch_dim
-    except ValueError as ex:
+    except ValueError as ex:  # pragma: no cover
         raise Exception(
             f"There was an issue handling the following mapping:\n{variable_dim_map}"
             f"\n from variables {list(ds.data_vars)} and dims {list(ds.dims)}"