diff --git a/.github/workflows/python-package-pip.yml b/.github/workflows/python-package-pip.yml index 14f46d3..64c874a 100644 --- a/.github/workflows/python-package-pip.yml +++ b/.github/workflows/python-package-pip.yml @@ -25,10 +25,11 @@ jobs: run: | python -m pip install . "zarr${{ matrix.zarr-version }}" python -m pip install pytest + python -m pip install pytest-cov - name: Run tests env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | - python -m pytest tests/ + python -m pytest tests/ --cov mllam_data_prep --cov-fail-under=100 --cov-report term-missing diff --git a/mllam_data_prep/__init__.py b/mllam_data_prep/__init__.py index 64bfa91..575a488 100644 --- a/mllam_data_prep/__init__.py +++ b/mllam_data_prep/__init__.py @@ -1,8 +1,10 @@ +# pragma: no cover + import importlib.metadata -try: +try: # pragma: no cover __version__ = importlib.metadata.version(__name__) -except importlib.metadata.PackageNotFoundError: +except importlib.metadata.PackageNotFoundError: # pragma: no cover __version__ = "unknown" # expose the public API diff --git a/mllam_data_prep/__main__.py b/mllam_data_prep/__main__.py index e11ffe1..d131439 100644 --- a/mllam_data_prep/__main__.py +++ b/mllam_data_prep/__main__.py @@ -1,20 +1,20 @@ -import os -from pathlib import Path +import os # pragma: no cover +from pathlib import Path # pragma: no cover -from loguru import logger +from loguru import logger # pragma: no cover -from .create_dataset import create_dataset_zarr +from .create_dataset import create_dataset_zarr # pragma: no cover # Attempt to import psutil and dask.distributed modules -DASK_DISTRIBUTED_AVAILABLE = True -try: +DASK_DISTRIBUTED_AVAILABLE = True # pragma: no cover +try: # pragma: no cover import psutil from dask.diagnostics import ProgressBar from dask.distributed import LocalCluster -except (ImportError, ModuleNotFoundError): +except (ImportError, ModuleNotFoundError): # pragma: no cover 
DASK_DISTRIBUTED_AVAILABLE = False -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover import argparse parser = argparse.ArgumentParser( diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 8a7ccfd..397def8 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -26,7 +26,7 @@ def validate_config(config_inputs): for input_dataset_name, input_dataset in config_inputs.items(): if not input_dataset.variables and not input_dataset.derived_variables: - raise InvalidConfigException( + raise InvalidConfigException( # pragma: no cover f"Input dataset '{input_dataset_name}' is missing the keys `variables` and/or" " `derived_variables`. Make sure that you update the config so that the input" f" dataset '{input_dataset_name}' contains at least either a `variables` or" @@ -36,16 +36,16 @@ def validate_config(config_inputs): # Check so that there are no overlapping variables if isinstance(input_dataset.variables, list): variable_vars = input_dataset.variables - elif isinstance(input_dataset.variables, dict): + elif isinstance(input_dataset.variables, dict): # pragma: no cover variable_vars = input_dataset.variables.keys() else: - raise TypeError( + raise TypeError( # pragma: no cover f"Expected an instance of list or dict, but got {type(input_dataset.variables)}." ) derived_variable_vars = input_dataset.derived_variables.keys() common_vars = list(set(variable_vars) & set(derived_variable_vars)) if len(common_vars) > 0: - raise InvalidConfigException( + raise InvalidConfigException( # pragma: no cover "Both `variables` and `derived_variables` include the following variables name(s):" f" '{', '.join(common_vars)}'. This is not allowed. 
Make sure that there" " are no overlapping variable names between `variables` and `derived_variables`," @@ -382,7 +382,7 @@ class _(JSONWizard.Meta): raise_on_unknown_json_key = True -if __name__ == "__main__": +if __name__ == "__main__": # pragma: no cover import argparse argparser = argparse.ArgumentParser() diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 1718322..6a9dd7b 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -22,10 +22,10 @@ from .ops.statistics import calc_stats from .ops.subsetting import extract_variable -if Version(zarr.__version__) >= Version("3"): +if Version(zarr.__version__) >= Version("3"): # pragma: no cover from zarr.codecs import BloscCodec, BloscShuffle else: - from numcodecs import Blosc + from numcodecs import Blosc # pragma: no cover # The config versions defined in SUPPORTED_CONFIG_VERSIONS are the ones currently supported. # The `extra` field in the config that was added between v0.2.0 and v0.5.0 is optional, and @@ -38,7 +38,7 @@ def _check_dataset_attributes(ds, expected_attributes, dataset_name): # check that the dataset has the expected attributes with the expected values missing_attributes = set(expected_attributes.keys()) - set(ds.attrs.keys()) if len(missing_attributes) > 0: - raise ValueError( + raise ValueError( # pragma: no cover f"Dataset {dataset_name} is missing the following attributes: {missing_attributes}" ) @@ -47,13 +47,13 @@ def _check_dataset_attributes(ds, expected_attributes, dataset_name): key: val for key, val in expected_attributes.items() if ds.attrs[key] != val } if len(incorrect_attributes) > 0: - s_list = "\n".join( + s_list = "\n".join( # pragma: no cover [ f"{key}: {val} != {ds.attrs[key]}" for key, val in incorrect_attributes.items() ] ) - raise ValueError( + raise ValueError( # pragma: no cover f"Dataset {dataset_name} has the following incorrect attributes: {s_list}" ) @@ -67,11 +67,11 @@ def 
_merge_dataarrays_by_target(dataarrays_by_target): for da in das: d = da.attrs.get("variables_mapping_dim", None) if d is None: - raise ValueError( + raise ValueError( # pragma: no cover f"Dataarray for target {target} does not have the 'variables_mapping_dim' attribute" ) if concat_dim is not None and d != concat_dim: - raise ValueError( + raise ValueError( # pragma: no cover f"Dataarrays for target {target} have different 'variables_mapping_dim' attributes: {d} != {concat_dim}" ) concat_dim = d @@ -104,7 +104,7 @@ def _merge_dataarrays_by_target(dataarrays_by_target): " Maybe you need to give the 'feature' dimension a unique name for each target variable?" ) from ex else: - raise ex + raise ex # pragma: no cover return ds @@ -124,13 +124,13 @@ def create_dataset(config: Config): as defined in the config file. """ if not config.schema_version in SUPPORTED_CONFIG_VERSIONS: - raise ValueError( + raise ValueError( # pragma: no cover f"Unsupported schema version {config.schema_version}. Only schema versions " f" {', '.join(SUPPORTED_CONFIG_VERSIONS)} are supported by mllam-data-prep " f"v{__version__}." ) if config.schema_version == "v0.2.0" and config.extra: - raise ValueError( + raise ValueError( # pragma: no cover "Config schema version v0.2.0 does not support the `extra` field. Please " "update the schema version used in your config to v0.5.0." 
) @@ -154,7 +154,7 @@ def create_dataset(config: Config): logger.info(f"Loading dataset {dataset_name} from {path}") try: ds_input = load_input_dataset(fp=path) - except Exception as ex: + except Exception as ex: # pragma: no cover raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex if input_config.coord_ranges is not None: @@ -177,7 +177,7 @@ def create_dataset(config: Config): for var_name in selected_variables: ds[var_name] = extract_variable(ds=ds_input, var_name=var_name) else: - raise ValueError( + raise ValueError( # pragma: no cover "The `variables` argument should be a list or a dictionary" ) @@ -204,7 +204,7 @@ def create_dataset(config: Config): # final dataset missing_dims = set(output_dims) - set(dim_mapping.keys()) if missing_dims: - raise ValueError( + raise ValueError( # pragma: no cover f"Missing dimension mapping for {missing_dims}" f" for input dataset {dataset_name}, please provide" " a mapping for all output dimensions by" @@ -220,7 +220,7 @@ def create_dataset(config: Config): dim_mapping=dim_mapping, expected_input_var_dims=expected_input_var_dims, ) - except Exception as ex: + except Exception as ex: # pragma: no cover raise Exception( f"There was an issue stacking dimensions and variables to" f" produce variable {target_output_var} from dataset {dataset_name}" @@ -323,10 +323,10 @@ def create_dataset_zarr(fp_config: Path, fp_zarr: Optional[str | Path] = None): # use zstd compression since it has a good balance of speed and compression ratio # https://engineering.fb.com/2016/08/31/core-infra/smaller-and-faster-data-compression-with-zstandard/ - if Version(zarr.__version__) >= Version("3"): + if Version(zarr.__version__) >= Version("3"): # pragma: no cover compressor = BloscCodec(cname="zstd", clevel=3, shuffle=BloscShuffle.bitshuffle) encoding = {v: {"compressors": compressor} for v in ds.data_vars} - else: + else: # pragma: no cover compressor = Blosc(cname="zstd", clevel=1, shuffle=Blosc.BITSHUFFLE) encoding = {v: 
{"compressor": compressor} for v in ds.data_vars} diff --git a/mllam_data_prep/ops/chunking.py b/mllam_data_prep/ops/chunking.py index 9df27e9..9408bc4 100644 --- a/mllam_data_prep/ops/chunking.py +++ b/mllam_data_prep/ops/chunking.py @@ -38,7 +38,7 @@ def check_chunk_size(ds, chunks): memory_usage = total_chunk_size * bytes_per_element - if memory_usage > CHUNK_MAX_SIZE_WARNING: + if memory_usage > CHUNK_MAX_SIZE_WARNING: # pragma: no cover logger.warning( f"The chunk size for '{var_name}' exceeds '{CHUNK_MAX_SIZE_WARNING / 1024**3}' GB." ) @@ -67,7 +67,7 @@ def chunk_dataset(ds, chunks): # Try chunking try: ds = ds.chunk(chunks) - except Exception as ex: + except Exception as ex: # pragma: no cover raise Exception(f"Error chunking dataset: {ex}") return ds diff --git a/mllam_data_prep/ops/derive_variable/main.py b/mllam_data_prep/ops/derive_variable/main.py index 07fc8f9..944fbec 100644 --- a/mllam_data_prep/ops/derive_variable/main.py +++ b/mllam_data_prep/ops/derive_variable/main.py @@ -111,7 +111,7 @@ def derive_variable(ds, derived_variable, chunking, target_dims): # Align the derived field to the output dataset dimensions (if necessary) derived_field = _align_derived_variable(derived_field, ds, target_dims) - else: + else: # pragma: no cover raise TypeError( f"Expected an instance of xr.DataArray, but got {type(derived_field)}." ) diff --git a/mllam_data_prep/ops/derive_variable/physical_field.py b/mllam_data_prep/ops/derive_variable/physical_field.py index d7b9617..7829fc8 100644 --- a/mllam_data_prep/ops/derive_variable/physical_field.py +++ b/mllam_data_prep/ops/derive_variable/physical_field.py @@ -5,6 +5,7 @@ of time and lat/lon location), but also of other physical fields, such as wind speed, which is a function of both meridional and zonal wind components. 
""" + import datetime import numpy as np @@ -39,7 +40,7 @@ def calculate_toa_radiation(lat, lon, time): if isinstance(time, xr.DataArray): day = time.dt.dayofyear hour_utc = time.dt.hour - elif isinstance(time, datetime.datetime): + elif isinstance(time, datetime.datetime): # pragma: no cover day = time.timetuple().tm_yday hour_utc = time.hour else: diff --git a/mllam_data_prep/ops/derive_variable/time_components.py b/mllam_data_prep/ops/derive_variable/time_components.py index 5329e12..c3d29bd 100644 --- a/mllam_data_prep/ops/derive_variable/time_components.py +++ b/mllam_data_prep/ops/derive_variable/time_components.py @@ -2,6 +2,7 @@ Contains functions used to derive time component fields, such as e.g. day of year and hour of day. """ + import datetime import numpy as np @@ -31,7 +32,7 @@ def calculate_hour_of_day(time, component): # Get the hour of the day if isinstance(time, xr.DataArray): hour_of_day = time.dt.hour - elif isinstance(time, datetime.datetime): + elif isinstance(time, datetime.datetime): # pragma: no cover hour_of_day = time.hour else: raise TypeError( @@ -44,7 +45,7 @@ def calculate_hour_of_day(time, component): hour_of_day_encoded = np.sin((hour_of_day / 24) * 2 * np.pi) elif component == "cos": hour_of_day_encoded = np.cos((hour_of_day / 24) * 2 * np.pi) - else: + else: # pragma: no cover raise ValueError( f"Invalid value of `component`: '{component}'. Expected one of: 'cos' or 'sin'." " Please update the config accordingly." 
@@ -81,9 +82,9 @@ def calculate_day_of_year(time, component): logger.info("Calculating day of year") # Get the day of year - if isinstance(time, xr.DataArray): + if isinstance(time, xr.DataArray): # pragma: no cover day_of_year = time.dt.dayofyear - elif isinstance(time, datetime.datetime): + elif isinstance(time, datetime.datetime): # pragma: no cover day_of_year = time.timetuple().tm_yday else: raise TypeError( @@ -92,17 +93,17 @@ def calculate_day_of_year(time, component): ) # Cyclic encoding of day of year - use 366 to include leap years! - if component == "sin": + if component == "sin": # pragma: no cover day_of_year_encoded = np.sin((day_of_year / 366) * 2 * np.pi) - elif component == "cos": + elif component == "cos": # pragma: no cover day_of_year_encoded = np.cos((day_of_year / 366) * 2 * np.pi) - else: + else: # pragma: no cover raise ValueError( f"Invalid value of `component`: '{component}'. Expected one of: 'cos' or 'sin'." " Please update the config accordingly." ) - if isinstance(day_of_year_encoded, xr.DataArray): + if isinstance(day_of_year_encoded, xr.DataArray): # pragma: no cover # Add attributes day_of_year_encoded.name = "day_of_year_" + component day_of_year_encoded.attrs[ @@ -110,4 +111,4 @@ def calculate_day_of_year(time, component): ] = f"{component.capitalize()} component of cyclically encoded day of year" day_of_year_encoded.attrs["units"] = "1" - return day_of_year_encoded + return day_of_year_encoded # pragma: no cover diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index f6bfc34..c9a9b52 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -19,7 +19,7 @@ def load_input_dataset(fp): try: ds = xr.open_zarr(fp) - except ValueError: + except ValueError: # pragma: no cover ds = xr.open_dataset(fp) return ds diff --git a/mllam_data_prep/ops/mapping.py b/mllam_data_prep/ops/mapping.py index 9482ff8..6eae33d 100644 --- a/mllam_data_prep/ops/mapping.py +++ 
b/mllam_data_prep/ops/mapping.py @@ -2,7 +2,7 @@ def _check_for_malformed_list_arg(s): - if isinstance(s, str) and "," in s: + if isinstance(s, str) and "," in s: # pragma: no cover raise Exception( "Rather than writing `{s}` to define a list you would `[{s}]` in the config file." ) @@ -65,13 +65,13 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims): for arch_dim in list(dim_mapping.keys()): if dim_mapping[arch_dim].method == "stack_variables_by_var_name": variable_dim_mappings[arch_dim] = dim_mapping.pop(arch_dim) - if len(variable_dim_mappings) > 1: + if len(variable_dim_mappings) > 1: # pragma: no cover raise ValueError( "Only one mapping which requires stacking variables" " into a single dataarray is allowed, found ones targeting" f" the following arch dimensions: {list(variable_dim_mappings.keys())}" ) - elif len(variable_dim_mappings) == 0: + elif len(variable_dim_mappings) == 0: # pragma: no cover raise Exception( "At least one mapping should be defined for stacking variables, i.e. uses" f" the method `stack_variables_by_var_name`. 
Current mapping is: {dim_mapping}" @@ -79,7 +79,9 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims): # check that none of the variables have dims that are not in the expected_input_var_dims for var_name in ds.data_vars: - if not set(ds[var_name].dims).issubset(expected_input_var_dims): + if not set(ds[var_name].dims).issubset( + expected_input_var_dims + ): # pragma: no cover extra_dims = set(ds[var_name].dims) - set(expected_input_var_dims) raise ValueError( f"The variable {var_name} has dimensions {ds[var_name].dims} however the" @@ -101,7 +103,7 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims): # dimension, this is for example used for flatting the spatial dimensions # into a single dimension representing the grid index ds = ds.stack({arch_dim: source_dims}).reset_index(arch_dim) - else: + else: # pragma: no cover raise NotImplementedError(method) # Finally, we handle the stacking of variables to coordinate values. We @@ -125,14 +127,14 @@ def map_dims_and_variables(ds, dim_mapping, expected_input_var_dims): name_format=name_format, combined_dim_name=arch_dim, ) - else: + else: # pragma: no cover # TODO: this will have to involved xrarrays MultiIndex, but lets leave # this until we need it raise NotImplementedError(len(dims)) # set a flag we can use later to identify which coordinate the variables # were mapped into da.attrs["variables_mapping_dim"] = arch_dim - except ValueError as ex: + except ValueError as ex: # pragma: no cover raise Exception( f"There was an issue handling the following mapping:\n{variable_dim_map}" f"\n from variables {list(ds.data_vars)} and dims {list(ds.dims)}" diff --git a/mllam_data_prep/ops/selection.py b/mllam_data_prep/ops/selection.py index 37b91c1..55c424a 100644 --- a/mllam_data_prep/ops/selection.py +++ b/mllam_data_prep/ops/selection.py @@ -6,24 +6,24 @@ def _normalize_slice_startstop(s): - if isinstance(s, pd.Timestamp): + if isinstance(s, pd.Timestamp): # pragma: no cover return s 
elif isinstance(s, str): try: return pd.Timestamp(s) - except ValueError: + except ValueError: # pragma: no cover return s else: return s def _normalize_slice_step(s): - if isinstance(s, pd.Timedelta): + if isinstance(s, pd.Timedelta): # pragma: no cover return s elif isinstance(s, str): try: return pd.to_timedelta(s) - except ValueError: + except ValueError: # pragma: no cover return s else: return s @@ -56,10 +56,10 @@ def select_by_kwargs(ds, **coord_ranges): """ for coord, selection in coord_ranges.items(): - if coord not in ds.coords: + if coord not in ds.coords: # pragma: no cover raise ValueError(f"Coordinate {coord} not found in dataset") if isinstance(selection, Range): - if selection.start is None and selection.end is None: + if selection.start is None and selection.end is None: # pragma: no cover raise ValueError( f"Selection for coordinate {coord} must have either 'start' and 'end' given" ) @@ -83,9 +83,9 @@ def select_by_kwargs(ds, **coord_ranges): len(ds[coord]) > 0 ), f"You have selected an empty range {sel_start}:{sel_end} for coordinate {coord}" - elif isinstance(selection, list): + elif isinstance(selection, list): # pragma: no cover ds = ds.sel({coord: selection}) - else: + else: # pragma: no cover raise NotImplementedError( f"Selection for coordinate {coord} must be a list or a dict" ) @@ -109,7 +109,7 @@ def check_step(sel_step, coord, ds): all_steps = ds[coord].diff(dim=coord).values first_step = all_steps[0].astype("timedelta64[s]").astype(datetime.timedelta) - if not all(all_steps[0] == all_steps): + if not all(all_steps[0] == all_steps): # pragma: no cover raise ValueError( f"Step size for coordinate {coord} is not constant: {all_steps}" ) diff --git a/mllam_data_prep/ops/stacking.py b/mllam_data_prep/ops/stacking.py index a56e0fd..542777d 100644 --- a/mllam_data_prep/ops/stacking.py +++ b/mllam_data_prep/ops/stacking.py @@ -25,7 +25,7 @@ def stack_variables_as_coord_values(ds, name_format, combined_dim_name): The combined dataset with all 
variables stacked along the new coordinate """ - if "{var_name}" not in name_format: + if "{var_name}" not in name_format: # pragma: no cover raise ValueError( "The name_format should include the variable name as" " {var_name} to construct the new coordinate values" @@ -91,17 +91,17 @@ def stack_variables_by_coord_values(ds, coord, name_format, combined_dim_name): da_combined : xr.DataArray The combined dataset with the stacked variables along the `coord` """ - if "{var_name}" not in name_format: + if "{var_name}" not in name_format: # pragma: no cover raise ValueError( "The name_format should include the variable name as" " {var_name} to construct the new coordinate values" ) - if f"{{{coord}}}" not in name_format: + if f"{{{coord}}}" not in name_format: # pragma: no cover raise ValueError( "The name_format should include the coordinate name as" f" {{{coord}}} to construct the new coordinate values" ) - if coord not in ds.coords: + if coord not in ds.coords: # pragma: no cover raise ValueError( f"The coordinate {coord} is not in the dataset, found coords: {list(ds.coords)}" ) diff --git a/mllam_data_prep/ops/statistics.py b/mllam_data_prep/ops/statistics.py index 10031c2..c18fa2f 100644 --- a/mllam_data_prep/ops/statistics.py +++ b/mllam_data_prep/ops/statistics.py @@ -44,7 +44,7 @@ def calc_stats( vars_to_keep = [v for v in ds.data_vars if splitting_dim in ds[v].dims] ds = ds[vars_to_keep].diff(dim=splitting_dim) else: - raise NotImplementedError(pre_op) + raise NotImplementedError(pre_op) # pragma: no cover fn = getattr(ds, op) stats[op_split] = fn(dim=statistics_config.dims) diff --git a/mllam_data_prep/ops/subsetting.py b/mllam_data_prep/ops/subsetting.py index 80f2ce1..6dcadde 100644 --- a/mllam_data_prep/ops/subsetting.py +++ b/mllam_data_prep/ops/subsetting.py @@ -24,7 +24,7 @@ def extract_variable(ds, var_name, coords_to_sample=dict()): try: da = ds[var_name] - except KeyError as ex: + except KeyError as ex: # pragma: no cover raise KeyError( f"Could not 
find the variable `{var_name}` in the dataset. " f"The available variables are {list(ds.data_vars)}" @@ -34,14 +34,16 @@ def extract_variable(ds, var_name, coords_to_sample=dict()): coord_values = sampling.values try: da = da.sel(**{coord: coord_values}) - except KeyError as ex: + except KeyError as ex: # pragma: no cover raise KeyError( f"Could not find the all coordinate values `{coord_values}` in " f"coordinate `{coord}` in the dataset" ) from ex expected_units = sampling.units coord_units = da[coord].attrs.get("units", None) - if coord_units is not None and coord_units != expected_units: + if ( + coord_units is not None and coord_units != expected_units + ): # pragma: no cover raise ValueError( f"Expected units {expected_units} for coordinate {coord}" f" in variable {var_name} but got {coord_units}"