Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog/214.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added a constraint to add the parent experiment.
103 changes: 85 additions & 18 deletions packages/climate-ref-core/src/climate_ref_core/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,32 +190,33 @@ def apply(
values = tuple(group[facet].unique())
supplementary_facets[facet] += values

supplementary_group = data_catalog
for facet, values in supplementary_facets.items():
mask = supplementary_group[facet].isin(values)
supplementary_group = supplementary_group[mask]
mask = data_catalog[list(supplementary_facets)].isin(supplementary_facets).all(axis="columns")
supplementary_group = data_catalog[mask]
if not supplementary_group.empty:
matching_facets = list(self.matching_facets)
facets = matching_facets + list(self.optional_matching_facets)
datasets = group[facets].drop_duplicates()
indices = set()
select = pd.Series(False, index=supplementary_group.index)
for i in range(len(datasets)):
dataset = datasets.iloc[i]
# Restrict the supplementary datasets to those that match the main dataset.
supplementaries = supplementary_group[
(supplementary_group[matching_facets] == dataset[matching_facets]).all(1)
(supplementary_group[matching_facets] == dataset[matching_facets]).all(axis="columns")
]
if not supplementaries.empty:
# Select the best matching supplementary dataset based on the optional matching facets.
scores = (supplementaries[facets] == dataset).sum(axis=1)
matches = supplementaries[scores == scores.max()]
if "version" in facets:
scores = (supplementaries[facets] == dataset).sum(axis="columns")
supplementaries = supplementaries[scores == scores.max()]
if "version" in supplementaries.columns:
# Select the latest version if there are multiple matches
matches = matches[matches["version"] == matches["version"].max()]
# Select one match per dataset
indices.add(matches.index[0])
supplementaries = supplementaries[
supplementaries["version"] == supplementaries["version"].max()
]
# Select only the first group if there are still multiple matches
first_supplementary_dataset = supplementaries[facets].drop_duplicates().iloc[0]
select |= (supplementaries[facets] == first_supplementary_dataset).all(axis="columns")

supplementary_group = supplementary_group.loc[list(indices)].drop_duplicates()
supplementary_group = supplementary_group[select]

return pd.concat([group, supplementary_group])

Expand Down Expand Up @@ -477,15 +478,81 @@ def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame


@frozen
class SelectParentExperiment:
class AddParentDataset:
"""
Include a dataset's parent experiment in the selection
Include a dataset's parent in the selection.
"""

parent_facet_map: dict[str, str]
"""
Mapping from child to parent facets.
"""

def apply(self, group: pd.DataFrame, data_catalog: pd.DataFrame) -> pd.DataFrame:
"""
Include a dataset's parent experiment in the selection
Include a dataset's parent in the selection.

Not yet implemented
"""
raise NotImplementedError("This is not implemented yet") # pragma: no cover
all_parent_facets = sorted({*self.parent_facet_map.keys(), *self.parent_facet_map.values()})

# Remove datasets that do not have all parent facets set.
valid_group = group[all_parent_facets].dropna(axis="columns")

# Add the parent datasets from the data catalog.
select = pd.Series(False, index=data_catalog.index)
select.loc[valid_group.index] = True
for _, child_dataset in valid_group.groupby(all_parent_facets):
child_facets = {facet: child_dataset[facet].unique().tolist() for facet in all_parent_facets}
parent_facets = {
parent_facet: child_facets[child_facet]
for parent_facet, child_facet in self.parent_facet_map.items()
}
parent_dataset = data_catalog[
data_catalog[list(parent_facets)].isin(parent_facets).all(axis="columns")
]
if parent_dataset.empty:
# Drop the child dataset if no parent dataset is found.
logger.debug(
f"Constraint {self} not satisfied because no parent dataset found for "
f"{', '.join(f'{k}={v}' for k, v in child_facets.items())}"
)
select.loc[child_dataset.index] = False
else:
# Add the latest version of the dataset to the selection.
parent_dataset = parent_dataset[parent_dataset["version"] == parent_dataset["version"].max()]
select.loc[parent_dataset.index] = True

return data_catalog[select]

@classmethod
def from_defaults(
cls,
source_type: SourceDatasetType,
) -> Self:
"""
Include a dataset's parent in the selection.

The constraint is created using the defaults for the source_type.

Parameters
----------
source_type:
The source_type of the variable to add.

Returns
-------
:
A constraint to include a dataset's parent in the selection.

"""
parent_facet_options = {
SourceDatasetType.CMIP6: {
"source_id": "parent_source_id",
"experiment_id": "parent_experiment_id",
"variant_label": "parent_variant_label",
"table_id": "table_id",
"variable_id": "variable_id",
"grid_label": "grid_label",
},
}
return cls(parent_facet_map=parent_facet_options[source_type])
Loading