PMP enso #273


Merged (69 commits) on May 26, 2025
414a83b
initial commit for enso codes
lee1043 May 5, 2025
b5d8751
rename internal function and generalize variable name
lee1043 May 5, 2025
3973278
apply changes from #271
lee1043 May 5, 2025
92ede94
update
lee1043 May 5, 2025
7c5730f
update
lee1043 May 5, 2025
06e50ff
pre-commit fix
lee1043 May 5, 2025
7719d88
pre-commit fix
lee1043 May 5, 2025
f08c655
in progress
lee1043 May 6, 2025
8ecd3f7
update
lee1043 May 6, 2025
afa9c49
Update packages/climate-ref-pmp/src/climate_ref_pmp/diagnostics/enso.py
lee1043 May 7, 2025
76c7b97
update
lee1043 May 7, 2025
b2aeb1c
update
lee1043 May 7, 2025
81e60c4
in progress
lee1043 May 8, 2025
dbcdade
in progress
lee1043 May 8, 2025
d6b7491
update
lee1043 May 8, 2025
75501a6
add change log
lee1043 May 8, 2025
4b689d4
Update environment.yml
lee1043 May 8, 2025
0098813
update
lee1043 May 8, 2025
beae40e
update
lee1043 May 8, 2025
78d8d3b
Merge remote-tracking branch 'origin/main' into 223_pmp-enso-2
lewisjared May 9, 2025
d11b581
feat: Rework so that the command is executed
lewisjared May 9, 2025
c4dd856
clean up
lee1043 May 9, 2025
a49b336
update
lee1043 May 9, 2025
5b6885b
ruff fix
lee1043 May 9, 2025
7511d09
remove enso param file as enso driver does not need it for the curren…
lee1043 May 9, 2025
cd116e9
update
lee1043 May 9, 2025
5e21ae7
generate landmask for reference per variable basis because it is poss…
lee1043 May 9, 2025
f19e52d
typo fix
lee1043 May 9, 2025
b78f6b3
update
lee1043 May 11, 2025
c986f81
update
lee1043 May 11, 2025
aec1b48
update
lee1043 May 11, 2025
477a4ae
add logger lib to the pmp env
lee1043 May 11, 2025
f606fc6
update
lee1043 May 11, 2025
136edb1
update -- bug fix
lee1043 May 12, 2025
665ba8d
update -- typo fix
lee1043 May 12, 2025
6c6a72c
update
lee1043 May 12, 2025
b488f32
adjust numpy version limit
lee1043 May 12, 2025
b9220c7
chore: Update lockfile
lewisjared May 12, 2025
9166fbc
bug fix
lee1043 May 12, 2025
c212593
update
lee1043 May 12, 2025
14fc030
typo fix
lee1043 May 12, 2025
61d937e
cmec converter added
lee1043 May 14, 2025
09740b2
update cmec converter
lee1043 May 15, 2025
b9adbfa
bug fix
lee1043 May 15, 2025
1d6c427
update
lee1043 May 15, 2025
89ebb03
update
lee1043 May 15, 2025
242daac
clean up
lee1043 May 15, 2025
3783587
add ERA-5
lee1043 May 15, 2025
fa77ff3
clean up
lee1043 May 15, 2025
31209b6
Merge remote-tracking branch 'origin/main' into 223_pmp-enso-2
lewisjared May 16, 2025
907d97e
chore: cleanup dict_datasets
lewisjared May 16, 2025
8c7e2df
chore: Add files to obs4REF registry
lewisjared May 16, 2025
5a8e54e
Merge branch 'main' into 223_pmp-enso-2
lewisjared May 16, 2025
5d1f92d
chore: Skip coverage of driver files
lewisjared May 16, 2025
42f1e89
Merge remote-tracking branch 'origin/main' into 223_pmp-enso-2
lewisjared May 16, 2025
1a7f33e
chore: Add areacella and sftlf
lewisjared May 18, 2025
2c066c1
chore: Adding REF_TEST_DATA_DIR for out-of-source sample data
lewisjared May 18, 2025
c9637c9
typo fix
lee1043 May 21, 2025
757ed71
bug fix -- re-enable landsea mask estimation for obs and models if ne…
lee1043 May 21, 2025
a06787a
typo fix
lee1043 May 22, 2025
24d99aa
Merge remote-tracking branch 'origin/main' into 223_pmp-enso-2
lewisjared May 22, 2025
7e7f9ca
testing
lee1043 May 22, 2025
1362cf3
clean up
lee1043 May 22, 2025
e4b8c31
clean up
lee1043 May 22, 2025
34c1746
Merge remote-tracking branch 'origin/main' into 223_pmp-enso-2
lewisjared May 26, 2025
59bef7d
chore: Add additional dimensions
lewisjared May 26, 2025
4389095
chore: Add regression outputs
lewisjared May 26, 2025
3755dd4
chore: fix number of obs4ref file
lewisjared May 26, 2025
67656d3
chore: Fix coverage
lewisjared May 26, 2025
1 change: 1 addition & 0 deletions changelog/273.feature.md
@@ -0,0 +1 @@
Implemented PMP ENSO metrics
3 changes: 3 additions & 0 deletions conftest.py
@@ -103,6 +103,9 @@ def regression_data_dir(test_data_dir) -> Path:

@pytest.fixture(autouse=True, scope="session")
def sample_data() -> None:
+    if os.environ.get("REF_TEST_DATA_DIR"):
+        logger.warning("Not fetching sample data. Using custom test data directory")
+        return
    # Downloads the sample data if it doesn't exist
    logger.disable("climate_ref_core.dataset_registry")
    fetch_sample_data(force_cleanup=False, symlink=False)
7 changes: 7 additions & 0 deletions docs/configuration.md
@@ -84,6 +84,13 @@ This defaults to the following locations:
  environment variable, if defined. (Linux)
* `%USERPROFILE%\AppData\Local\climate_ref\Cache` (Windows)

### `REF_TEST_DATA_DIR`

Override the location of the test data directory.
If this is not set, the test data directory is inferred from the location of the test suite.

If this is set, the sample data won't be fetched or updated.
### `REF_TEST_OUTPUT`

Path where the test output is stored.
2 changes: 1 addition & 1 deletion packages/climate-ref-core/src/climate_ref_core/logging.py
@@ -89,7 +89,7 @@ def initialise_logging(level: int | str, format: str, log_directory: str | Path)
        retention=10,
        level="DEBUG",
        format=VERBOSE_LOG_FORMAT,
-        colorize=True,
+        colorize=False,
    )
    logger.info("Starting REF logging")
    logger.info(f"arguments: {sys.argv}")
@@ -9,6 +9,8 @@
    fetch_all_files,
)

+NUM_OBS4REF_FILES = 58
+

@pytest.fixture
def fake_registry_file():
@@ -107,7 +109,7 @@ def test_fetch_all_files(mocker, tmp_path, symlink):
    registry.fetch = mocker.MagicMock(return_value=downloaded_file)

    fetch_all_files(registry, "obs4ref", tmp_path, symlink=symlink)
-    assert registry.fetch.call_count == 59
+    assert registry.fetch.call_count == NUM_OBS4REF_FILES

    expected_file = (
        tmp_path / "obs4REF/MOHC/HadISST-1-1/mon/ts/gn/v20210727/ts_mon_HadISST-1-1_PCMDI_gn_187001-201907.nc"
@@ -123,4 +125,4 @@ def test_fetch_all_files_no_output(mocker):
    registry.fetch = mocker.MagicMock()

    fetch_all_files(registry, "obs4ref", None)
-    assert registry.fetch.call_count == 59
+    assert registry.fetch.call_count == NUM_OBS4REF_FILES
12 changes: 10 additions & 2 deletions packages/climate-ref-pmp/src/climate_ref_pmp/__init__.py
@@ -6,22 +6,30 @@

from climate_ref_core.dataset_registry import DATASET_URL, dataset_registry_manager
from climate_ref_core.providers import CondaDiagnosticProvider
-from climate_ref_pmp.diagnostics import AnnualCycle, ExtratropicalModesOfVariability
+from climate_ref_pmp.diagnostics import ENSO, AnnualCycle, ExtratropicalModesOfVariability

__version__ = importlib.metadata.version("climate-ref-pmp")

# Create the PMP diagnostics provider
# PMP uses a conda environment to run the diagnostics
provider = CondaDiagnosticProvider("PMP", __version__)

+# Annual cycle diagnostics and metrics
+provider.register(AnnualCycle())
+
+# ENSO diagnostics and metrics
+# provider.register(ENSO("ENSO_perf"))  # Assigned to ESMValTool
+provider.register(ENSO("ENSO_tel"))
+provider.register(ENSO("ENSO_proc"))
+
+# Extratropical modes of variability diagnostics and metrics
provider.register(ExtratropicalModesOfVariability("PDO"))
provider.register(ExtratropicalModesOfVariability("NPGO"))
provider.register(ExtratropicalModesOfVariability("NAO"))
provider.register(ExtratropicalModesOfVariability("NAM"))
provider.register(ExtratropicalModesOfVariability("PNA"))
provider.register(ExtratropicalModesOfVariability("NPO"))
provider.register(ExtratropicalModesOfVariability("SAM"))
-provider.register(AnnualCycle())


dataset_registry_manager.register(
@@ -1,9 +1,11 @@
"""PMP diagnostics."""

from climate_ref_pmp.diagnostics.annual_cycle import AnnualCycle
+from climate_ref_pmp.diagnostics.enso import ENSO
from climate_ref_pmp.diagnostics.variability_modes import ExtratropicalModesOfVariability

__all__ = [
+    "ENSO",
    "AnnualCycle",
    "ExtratropicalModesOfVariability",
]
245 changes: 245 additions & 0 deletions packages/climate-ref-pmp/src/climate_ref_pmp/diagnostics/enso.py
@@ -0,0 +1,245 @@
import json
import os
from collections.abc import Collection, Iterable
from typing import Any

from loguru import logger

from climate_ref_core.constraints import AddSupplementaryDataset
from climate_ref_core.datasets import DatasetCollection, FacetFilter, SourceDatasetType
from climate_ref_core.diagnostics import (
    CommandLineDiagnostic,
    DataRequirement,
    ExecutionDefinition,
    ExecutionResult,
)
from climate_ref_pmp.pmp_driver import _get_resource, process_json_result


class ENSO(CommandLineDiagnostic):
    """
    Calculate the ENSO performance metrics for a dataset
    """

    facets = ("source_id", "member_id", "grid_label", "experiment_id", "metric", "reference_datasets")

    def __init__(self, metrics_collection: str, experiments: Collection[str] = ("historical",)) -> None:
        self.name = metrics_collection
        self.slug = metrics_collection.lower()
        self.metrics_collection = metrics_collection
        self.parameter_file = "pmp_param_enso.py"
        self.obs_sources: tuple[str, ...]
        self.model_variables: tuple[str, ...]

        if metrics_collection == "ENSO_perf":  # pragma: no cover
            self.model_variables = ("pr", "ts", "tauu")
            self.obs_sources = ("GPCP-Monthly-3-2", "TropFlux-1-0", "HadISST-1-1")
        elif metrics_collection == "ENSO_tel":
            self.model_variables = ("pr", "ts")
            self.obs_sources = ("GPCP-Monthly-3-2", "TropFlux-1-0", "HadISST-1-1")
        elif metrics_collection == "ENSO_proc":
            self.model_variables = ("ts", "tauu", "hfls", "hfss", "rlds", "rlus", "rsds", "rsus")
            self.obs_sources = (
                "GPCP-Monthly-3-2",
                "TropFlux-1-0",
                "HadISST-1-1",
                "CERES-EBAF-4-2",
            )
        else:
            raise ValueError(
                f"Unknown metrics collection: {metrics_collection}. "
                "Valid options are: ENSO_perf, ENSO_tel, ENSO_proc"
            )

        self.data_requirements = self._get_data_requirements(experiments)

    def _get_data_requirements(
        self,
        experiments: Collection[str] = ("historical",),
    ) -> tuple[DataRequirement, DataRequirement]:
        filters = [
            FacetFilter(
                facets={
                    "frequency": "mon",
                    "experiment_id": tuple(experiments),
                    "variable_id": self.model_variables,
                }
            )
        ]

        return (
            DataRequirement(
                source_type=SourceDatasetType.obs4MIPs,
                filters=(
                    FacetFilter(facets={"source_id": self.obs_sources, "variable_id": self.model_variables}),
                ),
                group_by=("activity_id",),
            ),
            DataRequirement(
                source_type=SourceDatasetType.CMIP6,
                filters=tuple(filters),
                group_by=("source_id", "experiment_id", "member_id", "grid_label"),
                constraints=(
                    AddSupplementaryDataset.from_defaults("areacella", SourceDatasetType.CMIP6),
                    AddSupplementaryDataset.from_defaults("sftlf", SourceDatasetType.CMIP6),
                ),
            ),
        )

    def build_cmd(self, definition: ExecutionDefinition) -> Iterable[str]:
        """
        Run the diagnostic on the given configuration.

        Parameters
        ----------
        definition : ExecutionDefinition
            The configuration to run the diagnostic on.

        Returns
        -------
        :
            The result of running the diagnostic.
        """
        mc_name = self.metrics_collection

        # ------------------------------------------------
        # Get the input datasets information for the model
        # ------------------------------------------------
        input_datasets = definition.datasets[SourceDatasetType.CMIP6]
        input_selectors = input_datasets.selector_dict()
        source_id = input_selectors["source_id"]
        member_id = input_selectors["member_id"]
        experiment_id = input_selectors["experiment_id"]
        variable_ids = set(input_datasets["variable_id"].unique()) - {"areacella", "sftlf"}
        mod_run = f"{source_id}_{member_id}"

        # We only need one entry for the model run
        dict_mod: dict[str, dict[str, Any]] = {mod_run: {}}

        def extract_variable(dc: DatasetCollection, variable: str) -> list[str]:
            return dc.datasets[input_datasets["variable_id"] == variable]["path"].to_list()  # type: ignore

        # TO DO: Get the path to the files per variable
        for variable in variable_ids:
            list_files = extract_variable(input_datasets, variable)
            list_areacella = extract_variable(input_datasets, "areacella")
            list_sftlf = extract_variable(input_datasets, "sftlf")

            if len(list_files) > 0:
                dict_mod[mod_run][variable] = {
                    "path + filename": list_files,
                    "varname": variable,
                    "path + filename_area": list_areacella,
                    "areaname": "areacella",
                    "path + filename_landmask": list_sftlf,
                    "landmaskname": "sftlf",
                }

        # -------------------------------------------------------
        # Get the input datasets information for the observations
        # -------------------------------------------------------
        reference_dataset = definition.datasets[SourceDatasetType.obs4MIPs]
        reference_dataset_names = reference_dataset["source_id"].unique()

        dict_obs: dict[str, dict[str, Any]] = {}

        # TO DO: Get the path to the files per variable and per source
        for obs_name in reference_dataset_names:
            dict_obs[obs_name] = {}
            for variable in variable_ids:
                # Get the list of files for the current variable and observation source
                list_files = reference_dataset.datasets[
                    (reference_dataset["variable_id"] == variable)
                    & (reference_dataset["source_id"] == obs_name)
                ]["path"].to_list()
                # If the list is not empty, add it to the dictionary
                if len(list_files) > 0:
                    dict_obs[obs_name][variable] = {
                        "path + filename": list_files,
                        "varname": variable,
                    }

        # Create input directory
        dict_datasets = {
            "model": dict_mod,
            "observations": dict_obs,
            "metricsCollection": mc_name,
            "experiment_id": experiment_id,
        }

        # Create JSON file for dictDatasets
        json_file = os.path.join(
            definition.output_directory, f"input_{mc_name}_{source_id}_{experiment_id}_{member_id}.json"
        )
        with open(json_file, "w") as f:
            json.dump(dict_datasets, f, indent=4)
        logger.debug(f"JSON file created: {json_file}")

        driver_file = _get_resource("climate_ref_pmp.drivers", "enso_driver.py", use_resources=True)
        return [
            "python",
            driver_file,
            "--metrics_collection",
            mc_name,
            "--experiment_id",
            experiment_id,
            "--input_json_path",
            json_file,
            "--output_directory",
            str(definition.output_directory),
        ]

    def build_execution_result(self, definition: ExecutionDefinition) -> ExecutionResult:
        """
        Build a diagnostic result from the output of the PMP driver

        Parameters
        ----------
        definition
            Definition of the diagnostic execution

        Returns
        -------
        Result of the diagnostic execution
        """
        input_datasets = definition.datasets[SourceDatasetType.CMIP6]
        source_id = input_datasets["source_id"].unique()[0]
        experiment_id = input_datasets["experiment_id"].unique()[0]
        member_id = input_datasets["member_id"].unique()[0]
        mc_name = self.metrics_collection
        pattern = f"{mc_name}_{source_id}_{experiment_id}_{member_id}"

        # Find the results files
        results_files = list(definition.output_directory.glob(f"{pattern}_cmec.json"))
        logger.debug(f"Results files: {results_files}")

        if len(results_files) != 1:  # pragma: no cover
            logger.warning(f"A single cmec output file not found: {results_files}")
            return ExecutionResult.build_from_failure(definition)

        # Find the other outputs
        png_files = [definition.as_relative_path(f) for f in definition.output_directory.glob("*.png")]
        data_files = [definition.as_relative_path(f) for f in definition.output_directory.glob("*.nc")]

        cmec_output, cmec_metric = process_json_result(results_files[0], png_files, data_files)

        input_selectors = definition.datasets[SourceDatasetType.CMIP6].selector_dict()
        cmec_metric_bundle = cmec_metric.remove_dimensions(
            [
                "model",
                "realization",
            ],
        ).prepend_dimensions(
            {
                "source_id": input_selectors["source_id"],
                "member_id": input_selectors["member_id"],
                "grid_label": input_selectors["grid_label"],
                "experiment_id": input_selectors["experiment_id"],
            }
        )

        return ExecutionResult.build_from_output_bundle(
            definition,
            cmec_output_bundle=cmec_output,
            cmec_metric_bundle=cmec_metric_bundle,
        )
@@ -37,18 +37,18 @@ def __init__(self, mode_id: str):
        self.name = f"Extratropical modes of variability: {mode_id}"
        self.slug = f"extratropical-modes-of-variability-{mode_id.lower()}"

-        def get_data_requirements(
+        def _get_data_requirements(
            obs_source: str,
            obs_variable: str,
-            cmip_variable: str,
+            model_variable: str,
            extra_experiments: str | tuple[str, ...] | list[str] = (),
        ) -> tuple[DataRequirement, DataRequirement]:
            filters = [
                FacetFilter(
                    facets={
                        "frequency": "mon",
                        "experiment_id": ("historical", "hist-GHG", "piControl", *extra_experiments),
-                        "variable_id": cmip_variable,
+                        "variable_id": model_variable,
                    }
                )
            ]
@@ -70,10 +70,10 @@ def get_data_requirements(

        if self.mode_id in self.ts_modes:
            self.parameter_file = "pmp_param_MoV-ts.py"
-            self.data_requirements = get_data_requirements("HadISST-1-1", "ts", "ts")
+            self.data_requirements = _get_data_requirements("HadISST-1-1", "ts", "ts")
        elif self.mode_id in self.psl_modes:
            self.parameter_file = "pmp_param_MoV-psl.py"
-            self.data_requirements = get_data_requirements("20CR", "psl", "psl", extra_experiments=("amip",))
+            self.data_requirements = _get_data_requirements("20CR", "psl", "psl", extra_experiments=("amip",))
        else:
            raise ValueError(
                f"Unknown mode_id '{self.mode_id}'. Must be one of {self.ts_modes + self.psl_modes}"