Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/python-package-pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ jobs:
run: |
python -m pip install . "zarr${{ matrix.zarr-version }}"
python -m pip install pytest
python -m pip install pytest-coverage

- name: Run tests
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
run: |
python -m pytest tests/
python -m pytest tests/ --cov mllam_data_prep --cov-fail-under=100 --cov-report term-missing
6 changes: 4 additions & 2 deletions mllam_data_prep/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# pragma: no cover

import importlib.metadata

try:
try: # pragma: no cover
__version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError:
except importlib.metadata.PackageNotFoundError: # pragma: no cover
__version__ = "unknown"

# expose the public API
Expand Down
16 changes: 8 additions & 8 deletions mllam_data_prep/__main__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
import os
from pathlib import Path
import os # pragma: no cover
from pathlib import Path # pragma: no cover

from loguru import logger
from loguru import logger # pragma: no cover

from .create_dataset import create_dataset_zarr
from .create_dataset import create_dataset_zarr # pragma: no cover

# Attempt to import psutil and dask.distributed modules
DASK_DISTRIBUTED_AVAILABLE = True
try:
DASK_DISTRIBUTED_AVAILABLE = True # pragma: no cover
try: # pragma: no cover
import psutil
from dask.diagnostics import ProgressBar
from dask.distributed import LocalCluster
except (ImportError, ModuleNotFoundError):
except (ImportError, ModuleNotFoundError): # pragma: no cover
DASK_DISTRIBUTED_AVAILABLE = False

if __name__ == "__main__":
if __name__ == "__main__": # pragma: no cover
import argparse

parser = argparse.ArgumentParser(
Expand Down
10 changes: 5 additions & 5 deletions mllam_data_prep/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def validate_config(config_inputs):

for input_dataset_name, input_dataset in config_inputs.items():
if not input_dataset.variables and not input_dataset.derived_variables:
raise InvalidConfigException(
raise InvalidConfigException( # pragma: no cover
f"Input dataset '{input_dataset_name}' is missing the keys `variables` and/or"
" `derived_variables`. Make sure that you update the config so that the input"
f" dataset '{input_dataset_name}' contains at least either a `variables` or"
Expand All @@ -36,16 +36,16 @@ def validate_config(config_inputs):
# Check so that there are no overlapping variables
if isinstance(input_dataset.variables, list):
variable_vars = input_dataset.variables
elif isinstance(input_dataset.variables, dict):
elif isinstance(input_dataset.variables, dict): # pragma: no cover
variable_vars = input_dataset.variables.keys()
else:
raise TypeError(
raise TypeError( # pragma: no cover
f"Expected an instance of list or dict, but got {type(input_dataset.variables)}."
)
derived_variable_vars = input_dataset.derived_variables.keys()
common_vars = list(set(variable_vars) & set(derived_variable_vars))
if len(common_vars) > 0:
raise InvalidConfigException(
raise InvalidConfigException( # pragma: no cover
"Both `variables` and `derived_variables` include the following variables name(s):"
f" '{', '.join(common_vars)}'. This is not allowed. Make sure that there"
" are no overlapping variable names between `variables` and `derived_variables`,"
Expand Down Expand Up @@ -382,7 +382,7 @@ class _(JSONWizard.Meta):
raise_on_unknown_json_key = True


if __name__ == "__main__":
if __name__ == "__main__": # pragma: no cover
import argparse

argparser = argparse.ArgumentParser()
Expand Down
32 changes: 16 additions & 16 deletions mllam_data_prep/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
from .ops.statistics import calc_stats
from .ops.subsetting import extract_variable

if Version(zarr.__version__) >= Version("3"):
if Version(zarr.__version__) >= Version("3"): # pragma: no cover
from zarr.codecs import BloscCodec, BloscShuffle
else:
from numcodecs import Blosc
from numcodecs import Blosc # pragma: no cover

# The config versions defined in SUPPORTED_CONFIG_VERSIONS are the ones currently supported.
# The `extra` field in the config that was added between v0.2.0 and v0.5.0 is optional, and
Expand All @@ -38,7 +38,7 @@ def _check_dataset_attributes(ds, expected_attributes, dataset_name):
# check that the dataset has the expected attributes with the expected values
missing_attributes = set(expected_attributes.keys()) - set(ds.attrs.keys())
if len(missing_attributes) > 0:
raise ValueError(
raise ValueError( # pragma: no cover
f"Dataset {dataset_name} is missing the following attributes: {missing_attributes}"
)

Expand All @@ -47,13 +47,13 @@ def _check_dataset_attributes(ds, expected_attributes, dataset_name):
key: val for key, val in expected_attributes.items() if ds.attrs[key] != val
}
if len(incorrect_attributes) > 0:
s_list = "\n".join(
s_list = "\n".join( # pragma: no cover
[
f"{key}: {val} != {ds.attrs[key]}"
for key, val in incorrect_attributes.items()
]
)
raise ValueError(
raise ValueError( # pragma: no cover
f"Dataset {dataset_name} has the following incorrect attributes: {s_list}"
)

Expand All @@ -67,11 +67,11 @@ def _merge_dataarrays_by_target(dataarrays_by_target):
for da in das:
d = da.attrs.get("variables_mapping_dim", None)
if d is None:
raise ValueError(
raise ValueError( # pragma: no cover
f"Dataarray for target {target} does not have the 'variables_mapping_dim' attribute"
)
if concat_dim is not None and d != concat_dim:
raise ValueError(
raise ValueError( # pragma: no cover
f"Dataarrays for target {target} have different 'variables_mapping_dim' attributes: {d} != {concat_dim}"
)
concat_dim = d
Expand Down Expand Up @@ -104,7 +104,7 @@ def _merge_dataarrays_by_target(dataarrays_by_target):
" Maybe you need to give the 'feature' dimension a unique name for each target variable?"
) from ex
else:
raise ex
raise ex # pragma: no cover
return ds


Expand All @@ -124,13 +124,13 @@ def create_dataset(config: Config):
as defined in the config file.
"""
if not config.schema_version in SUPPORTED_CONFIG_VERSIONS:
raise ValueError(
raise ValueError( # pragma: no cover
f"Unsupported schema version {config.schema_version}. Only schema versions "
f" {', '.join(SUPPORTED_CONFIG_VERSIONS)} are supported by mllam-data-prep "
f"v{__version__}."
)
if config.schema_version == "v0.2.0" and config.extra:
raise ValueError(
raise ValueError( # pragma: no cover
"Config schema version v0.2.0 does not support the `extra` field. Please "
"update the schema version used in your config to v0.5.0."
)
Expand All @@ -154,7 +154,7 @@ def create_dataset(config: Config):
logger.info(f"Loading dataset {dataset_name} from {path}")
try:
ds_input = load_input_dataset(fp=path)
except Exception as ex:
except Exception as ex: # pragma: no cover
raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex

if input_config.coord_ranges is not None:
Expand All @@ -177,7 +177,7 @@ def create_dataset(config: Config):
for var_name in selected_variables:
ds[var_name] = extract_variable(ds=ds_input, var_name=var_name)
else:
raise ValueError(
raise ValueError( # pragma: no cover
"The `variables` argument should be a list or a dictionary"
)

Expand All @@ -204,7 +204,7 @@ def create_dataset(config: Config):
# final dataset
missing_dims = set(output_dims) - set(dim_mapping.keys())
if missing_dims:
raise ValueError(
raise ValueError( # pragma: no cover
f"Missing dimension mapping for {missing_dims}"
f" for input dataset {dataset_name}, please provide"
" a mapping for all output dimensions by"
Expand All @@ -220,7 +220,7 @@ def create_dataset(config: Config):
dim_mapping=dim_mapping,
expected_input_var_dims=expected_input_var_dims,
)
except Exception as ex:
except Exception as ex: # pragma: no cover
raise Exception(
f"There was an issue stacking dimensions and variables to"
f" produce variable {target_output_var} from dataset {dataset_name}"
Expand Down Expand Up @@ -323,10 +323,10 @@ def create_dataset_zarr(fp_config: Path, fp_zarr: Optional[str | Path] = None):

# use zstd compression since it has a good balance of speed and compression ratio
# https://engineering.fb.com/2016/08/31/core-infra/smaller-and-faster-data-compression-with-zstandard/
if Version(zarr.__version__) >= Version("3"):
if Version(zarr.__version__) >= Version("3"): # pragma: no cover
compressor = BloscCodec(cname="zstd", clevel=3, shuffle=BloscShuffle.bitshuffle)
encoding = {v: {"compressors": compressor} for v in ds.data_vars}
else:
else: # pragma: no cover
compressor = Blosc(cname="zstd", clevel=1, shuffle=Blosc.BITSHUFFLE)
encoding = {v: {"compressor": compressor} for v in ds.data_vars}

Expand Down
4 changes: 2 additions & 2 deletions mllam_data_prep/ops/chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def check_chunk_size(ds, chunks):

memory_usage = total_chunk_size * bytes_per_element

if memory_usage > CHUNK_MAX_SIZE_WARNING:
if memory_usage > CHUNK_MAX_SIZE_WARNING: # pragma: no cover
logger.warning(
f"The chunk size for '{var_name}' exceeds '{CHUNK_MAX_SIZE_WARNING / 1024**3}' GB."
)
Expand Down Expand Up @@ -67,7 +67,7 @@ def chunk_dataset(ds, chunks):
# Try chunking
try:
ds = ds.chunk(chunks)
except Exception as ex:
except Exception as ex: # pragma: no cover
raise Exception(f"Error chunking dataset: {ex}")

return ds
2 changes: 1 addition & 1 deletion mllam_data_prep/ops/derive_variable/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def derive_variable(ds, derived_variable, chunking, target_dims):

# Align the derived field to the output dataset dimensions (if necessary)
derived_field = _align_derived_variable(derived_field, ds, target_dims)
else:
else: # pragma: no cover
raise TypeError(
f"Expected an instance of xr.DataArray, but got {type(derived_field)}."
)
Expand Down
3 changes: 2 additions & 1 deletion mllam_data_prep/ops/derive_variable/physical_field.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
of time and lat/lon location), but also of other physical fields, such as
wind speed, which is a function of both meridional and zonal wind components.
"""

import datetime

import numpy as np
Expand Down Expand Up @@ -39,7 +40,7 @@ def calculate_toa_radiation(lat, lon, time):
if isinstance(time, xr.DataArray):
day = time.dt.dayofyear
hour_utc = time.dt.hour
elif isinstance(time, datetime.datetime):
elif isinstance(time, datetime.datetime): # pragma: no cover
day = time.timetuple().tm_yday
hour_utc = time.hour
else:
Expand Down
19 changes: 10 additions & 9 deletions mllam_data_prep/ops/derive_variable/time_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Contains functions used to derive time component fields, such as e.g. day of year
and hour of day.
"""

import datetime

import numpy as np
Expand Down Expand Up @@ -31,7 +32,7 @@ def calculate_hour_of_day(time, component):
# Get the hour of the day
if isinstance(time, xr.DataArray):
hour_of_day = time.dt.hour
elif isinstance(time, datetime.datetime):
elif isinstance(time, datetime.datetime): # pragma: no cover
hour_of_day = time.hour
else:
raise TypeError(
Expand All @@ -44,7 +45,7 @@ def calculate_hour_of_day(time, component):
hour_of_day_encoded = np.sin((hour_of_day / 24) * 2 * np.pi)
elif component == "cos":
hour_of_day_encoded = np.cos((hour_of_day / 24) * 2 * np.pi)
else:
else: # pragma: no cover
raise ValueError(
f"Invalid value of `component`: '{component}'. Expected one of: 'cos' or 'sin'."
" Please update the config accordingly."
Expand Down Expand Up @@ -81,9 +82,9 @@ def calculate_day_of_year(time, component):
logger.info("Calculating day of year")

# Get the day of year
if isinstance(time, xr.DataArray):
if isinstance(time, xr.DataArray): # pragma: no cover
day_of_year = time.dt.dayofyear
elif isinstance(time, datetime.datetime):
elif isinstance(time, datetime.datetime): # pragma: no cover
day_of_year = time.timetuple().tm_yday
else:
raise TypeError(
Expand All @@ -92,22 +93,22 @@ def calculate_day_of_year(time, component):
)

# Cyclic encoding of day of year - use 366 to include leap years!
if component == "sin":
if component == "sin": # pragma: no cover
day_of_year_encoded = np.sin((day_of_year / 366) * 2 * np.pi)
elif component == "cos":
elif component == "cos": # pragma: no cover
day_of_year_encoded = np.cos((day_of_year / 366) * 2 * np.pi)
else:
else: # pragma: no cover
raise ValueError(
f"Invalid value of `component`: '{component}'. Expected one of: 'cos' or 'sin'."
" Please update the config accordingly."
)

if isinstance(day_of_year_encoded, xr.DataArray):
if isinstance(day_of_year_encoded, xr.DataArray): # pragma: no cover
# Add attributes
day_of_year_encoded.name = "day_of_year_" + component
day_of_year_encoded.attrs[
"long_name"
] = f"{component.capitalize()} component of cyclically encoded day of year"
day_of_year_encoded.attrs["units"] = "1"

return day_of_year_encoded
return day_of_year_encoded # pragma: no cover
2 changes: 1 addition & 1 deletion mllam_data_prep/ops/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def load_input_dataset(fp):

try:
ds = xr.open_zarr(fp)
except ValueError:
except ValueError: # pragma: no cover
ds = xr.open_dataset(fp)

return ds
16 changes: 9 additions & 7 deletions mllam_data_prep/ops/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


def _check_for_malformed_list_arg(s):
if isinstance(s, str) and "," in s:
if isinstance(s, str) and "," in s: # pragma: no cover
raise Exception(
"Rather than writing `{s}` to define a list you would `[{s}]` in the config file."
)
Expand Down Expand Up @@ -65,21 +65,23 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims):
for arch_dim in list(dim_mapping.keys()):
if dim_mapping[arch_dim].method == "stack_variables_by_var_name":
variable_dim_mappings[arch_dim] = dim_mapping.pop(arch_dim)
if len(variable_dim_mappings) > 1:
if len(variable_dim_mappings) > 1: # pragma: no cover
raise ValueError(
"Only one mapping which requires stacking variables"
" into a single dataarray is allowed, found ones targeting"
f" the following arch dimensions: {list(variable_dim_mappings.keys())}"
)
elif len(variable_dim_mappings) == 0:
elif len(variable_dim_mappings) == 0: # pragma: no cover
raise Exception(
"At least one mapping should be defined for stacking variables, i.e. uses"
f" the method `stack_variables_by_var_name`. Current mapping is: {dim_mapping}"
)

# check that none of the variables have dims that are not in the expected_input_var_dims
for var_name in ds.data_vars:
if not set(ds[var_name].dims).issubset(expected_input_var_dims):
if not set(ds[var_name].dims).issubset(
expected_input_var_dims
): # pragma: no cover
extra_dims = set(ds[var_name].dims) - set(expected_input_var_dims)
raise ValueError(
f"The variable {var_name} has dimensions {ds[var_name].dims} however the"
Expand All @@ -101,7 +103,7 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims):
# dimension, this is for example used for flatting the spatial dimensions
# into a single dimension representing the grid index
ds = ds.stack({arch_dim: source_dims}).reset_index(arch_dim)
else:
else: # pragma: no cover
raise NotImplementedError(method)

# Finally, we handle the stacking of variables to coordinate values. We
Expand All @@ -125,14 +127,14 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims):
name_format=name_format,
combined_dim_name=arch_dim,
)
else:
else: # pragma: no cover
# TODO: this will have to involved xrarrays MultiIndex, but lets leave
# this until we need it
raise NotImplementedError(len(dims))
# set a flag we can use later to identify which coordinate the variables
# were mapped into
da.attrs["variables_mapping_dim"] = arch_dim
except ValueError as ex:
except ValueError as ex: # pragma: no cover
raise Exception(
f"There was an issue handling the following mapping:\n{variable_dim_map}"
f"\n from variables {list(ds.data_vars)} and dims {list(ds.dims)}"
Expand Down
Loading