From 5fd4443a02aef7ccb7b96e4d317a012ebedd12e2 Mon Sep 17 00:00:00 2001 From: danielgafni Date: Fri, 31 Oct 2025 01:34:40 +0200 Subject: [PATCH 01/73] :bug: remove code_version constructor argument from FeatureSpec --- src/metaxy/graph/diff/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/metaxy/graph/diff/models.py b/src/metaxy/graph/diff/models.py index b1822cadb..193881beb 100644 --- a/src/metaxy/graph/diff/models.py +++ b/src/metaxy/graph/diff/models.py @@ -277,7 +277,6 @@ def from_feature_graph(cls, graph: "FeatureGraph") -> "GraphData": field_node = FieldNode( key=field_spec.key, version=field_version, - code_version=field_spec.code_version, status=NodeStatus.NORMAL, ) field_nodes.append(field_node) From ef1c6026c911523b7c907592527eb28dea4a3bd7 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 06:59:24 +0100 Subject: [PATCH 02/73] fix: it looks like this field was missing/went missing --- src/metaxy/graph/diff/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/metaxy/graph/diff/models.py b/src/metaxy/graph/diff/models.py index 193881beb..b1822cadb 100644 --- a/src/metaxy/graph/diff/models.py +++ b/src/metaxy/graph/diff/models.py @@ -277,6 +277,7 @@ def from_feature_graph(cls, graph: "FeatureGraph") -> "GraphData": field_node = FieldNode( key=field_spec.key, version=field_version, + code_version=field_spec.code_version, status=NodeStatus.NORMAL, ) field_nodes.append(field_node) From 9db71af8249db8470951a8deaa55f476be7dcbf8 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 20:29:16 +0100 Subject: [PATCH 03/73] fix: refine implementation --- src/metaxy/models/feature.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index cf8087b3c..a52fe5985 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,6 +82,17 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) +class _CodeVersionDescriptor: + """Descriptor that returns field-only code version hashes.""" + + def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: + if owner.spec is None: + raise ValueError( + f"Feature '{owner.__name__}' has no spec; cannot compute code_version." + ) + return owner.spec.field_code_version_hash + + class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From e47769371669ff81af7a83887e8d133f1d7f7c87 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 23:04:25 +0100 Subject: [PATCH 04/73] fix: type --- docs/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/index.md b/docs/index.md index a86f345b6..8aa24d2fc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -106,6 +106,10 @@ When `Video` changes, Metaxy automatically identifies that `VoiceDetection` requ Every feature definition produces a deterministic version hash computed from its dependencies, fields, and code versions. When you modify a feature—whether changing its dependencies, adding fields, or updating transformation logic, Metaxy detects the change and propagates it downstream. This is done on multiple levels: `Feature` (class) level, field (class attribute) level, and of course on row level: each _sample_ in the metadata store tracks the version of _each field_ and the overall (class-level) feature version. +### Code vs Feature Versions + +`Feature.code_version` only looks at a feature's own fields. It hashes their keys and `code_version` values (in sorted order) and ignores the dependency graph entirely. Use it to answer _"did my feature's logic change?"_. In contrast, `Feature.feature_version()` includes both the local fields and every dependency, so it changes whenever parent features evolve. Checking both hashes lets you distinguish between local code updates and upstream changes. + This ensures that when feature definitions evolve, every feature that transitively depends on it can be systematically updated. Because Metaxy supports declaring dependencies on fields, it can identify when a feature _does not_ require recomputation, even if one of its parents has been changed (but only irrelevant fields did). This is a huge factor in improving efficiency and reducing unnecessary computations (and costs!). Because Metaxy feature graphs are static, Metaxy can calculate data version changes ahead of the actual computation. This enables patterns such as **computation preview** and **computation cost prediction**. From 1a4d02e69c47d15af2b7d8dd5f90206a4b1c6c19 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:07:47 +0100 Subject: [PATCH 05/73] chore: cleanup --- src/metaxy/models/feature.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index a52fe5985..cf8087b3c 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,17 +82,6 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) -class _CodeVersionDescriptor: - """Descriptor that returns field-only code version hashes.""" - - def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: - if owner.spec is None: - raise ValueError( - f"Feature '{owner.__name__}' has no spec; cannot compute code_version." - ) - return owner.spec.field_code_version_hash - - class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From 646f2bab6b42b356cb9c737e7b4d0c3fe0a76f2d Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:12:05 +0100 Subject: [PATCH 06/73] chore: cleanup --- src/metaxy/models/feature.py | 1 + src/metaxy/utils/typing.py | 9 +++++++++ 2 files changed, 10 insertions(+) create mode 100644 src/metaxy/utils/typing.py diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index cf8087b3c..175379ec9 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -19,6 +19,7 @@ from metaxy.models.plan import FeaturePlan, FQFieldKey from metaxy.models.types import FeatureKey from metaxy.utils.hashing import truncate_hash +from metaxy.utils.typing import _CodeVersionDescriptor if TYPE_CHECKING: import narwhals as nw diff --git a/src/metaxy/utils/typing.py b/src/metaxy/utils/typing.py new file mode 100644 index 000000000..9d6fdb135 --- /dev/null +++ b/src/metaxy/utils/typing.py @@ -0,0 +1,9 @@ +class _CodeVersionDescriptor: + """Descriptor that returns field-only code version hashes.""" + + def __get__(self, instance, owner) -> str: + if owner.spec is None: + raise ValueError( + f"Feature '{owner.__name__}' has no spec; cannot compute code_version." + ) + return owner.spec.field_code_version_hash From 0dcc8d3519b3406d9a7eaec88828bbb83280fc95 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:37:42 +0100 Subject: [PATCH 07/73] fix: update snapshots after merge --- src/metaxy/models/feature_spec.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 4ac81e87e..08fdc2002 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -285,9 +285,8 @@ def __init__(self, key: CoercibleToFeatureKey | Self, **kwargs): def fields_by_key(self) -> Mapping[FieldKey, FieldSpec]: return {c.key: c for c in self.fields} - @cached_property - def code_version(self) -> str: - """Hash of this feature's field code_versions only (no dependencies).""" + def _compute_field_code_version_hash(self) -> str: + """Compute a stable hash based solely on field code versions.""" hasher = hashlib.sha256() # Sort fields by key for deterministic ordering @@ -299,6 +298,16 @@ def code_version(self) -> str: return truncate_hash(hasher.hexdigest()) + @cached_property + def field_code_version_hash(self) -> str: + """Hash of this feature's field code_versions only (no dependencies).""" + return self._compute_field_code_version_hash() + + @cached_property + def code_version(self) -> str: + """Backward-compatible alias for the field-only code version hash.""" + return self.field_code_version_hash + def table_name(self) -> str: """Get SQL-like table name for this feature spec.""" return self.key.table_name From ac6db530e44f9f31d0cb8f92da57d4309cb1da9e Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 13:11:43 +0100 Subject: [PATCH 08/73] chore: cleanup --- src/metaxy/models/feature_spec.py | 15 +++------------ src/metaxy/utils/typing.py | 2 +- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 08fdc2002..1d43af368 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -285,8 +285,9 @@ def __init__(self, key: CoercibleToFeatureKey | Self, **kwargs): def fields_by_key(self) -> Mapping[FieldKey, FieldSpec]: return {c.key: c for c in self.fields} - def _compute_field_code_version_hash(self) -> str: - """Compute a stable hash based solely on field code versions.""" + @cached_property + def code_version(self) -> str: + """Hash based solely on this feature's field code versions.""" hasher = hashlib.sha256() # Sort fields by key for deterministic ordering @@ -298,16 +299,6 @@ def _compute_field_code_version_hash(self) -> str: return truncate_hash(hasher.hexdigest()) - @cached_property - def field_code_version_hash(self) -> str: - """Hash of this feature's field code_versions only (no dependencies).""" - return self._compute_field_code_version_hash() - - @cached_property - def code_version(self) -> str: - """Backward-compatible alias for the field-only code version hash.""" - return self.field_code_version_hash - def table_name(self) -> str: """Get SQL-like table name for this feature spec.""" return self.key.table_name diff --git a/src/metaxy/utils/typing.py b/src/metaxy/utils/typing.py index 9d6fdb135..5ef6d57a6 100644 --- a/src/metaxy/utils/typing.py +++ b/src/metaxy/utils/typing.py @@ -6,4 +6,4 @@ def __get__(self, instance, owner) -> str: raise ValueError( f"Feature '{owner.__name__}' has no spec; cannot compute code_version." ) - return owner.spec.field_code_version_hash + return owner.spec.code_version From 905cfa67688d98b1263deba8615eb242f4302418 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 13:44:08 +0100 Subject: [PATCH 09/73] chore: slides --- docs/index.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/index.md b/docs/index.md index 8aa24d2fc..a86f345b6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -106,10 +106,6 @@ When `Video` changes, Metaxy automatically identifies that `VoiceDetection` requ Every feature definition produces a deterministic version hash computed from its dependencies, fields, and code versions. When you modify a feature—whether changing its dependencies, adding fields, or updating transformation logic, Metaxy detects the change and propagates it downstream. This is done on multiple levels: `Feature` (class) level, field (class attribute) level, and of course on row level: each _sample_ in the metadata store tracks the version of _each field_ and the overall (class-level) feature version. -### Code vs Feature Versions - -`Feature.code_version` only looks at a feature's own fields. It hashes their keys and `code_version` values (in sorted order) and ignores the dependency graph entirely. Use it to answer _"did my feature's logic change?"_. In contrast, `Feature.feature_version()` includes both the local fields and every dependency, so it changes whenever parent features evolve. Checking both hashes lets you distinguish between local code updates and upstream changes. - This ensures that when feature definitions evolve, every feature that transitively depends on it can be systematically updated. Because Metaxy supports declaring dependencies on fields, it can identify when a feature _does not_ require recomputation, even if one of its parents has been changed (but only irrelevant fields did). This is a huge factor in improving efficiency and reducing unnecessary computations (and costs!). Because Metaxy feature graphs are static, Metaxy can calculate data version changes ahead of the actual computation. This enables patterns such as **computation preview** and **computation cost prediction**. From fe56b404dbb4303aa64ec8d0d79da0ea064c32f5 Mon Sep 17 00:00:00 2001 From: geoHeil <1694964+geoHeil@users.noreply.github.com> Date: Fri, 31 Oct 2025 13:19:44 +0100 Subject: [PATCH 10/73] Update feature_spec.py --- src/metaxy/models/feature_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 1d43af368..4ac81e87e 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -287,7 +287,7 @@ def fields_by_key(self) -> Mapping[FieldKey, FieldSpec]: @cached_property def code_version(self) -> str: - """Hash based solely on this feature's field code versions.""" + """Hash of this feature's field code_versions only (no dependencies).""" hasher = hashlib.sha256() # Sort fields by key for deterministic ordering From c4e16f2b3a596557235af6cd2d5bd6470f3815f5 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 13:48:11 +0100 Subject: [PATCH 11/73] chore: rename --- src/metaxy/models/feature.py | 2 +- src/metaxy/utils/{typing.py => code_version_descriptor.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/metaxy/utils/{typing.py => code_version_descriptor.py} (100%) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index 175379ec9..e13101cbf 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -18,8 +18,8 @@ ) from metaxy.models.plan import FeaturePlan, FQFieldKey from metaxy.models.types import FeatureKey +from metaxy.utils.code_version_descriptor import _CodeVersionDescriptor from metaxy.utils.hashing import truncate_hash -from metaxy.utils.typing import _CodeVersionDescriptor if TYPE_CHECKING: import narwhals as nw diff --git a/src/metaxy/utils/typing.py b/src/metaxy/utils/code_version_descriptor.py similarity index 100% rename from src/metaxy/utils/typing.py rename to src/metaxy/utils/code_version_descriptor.py From 20f58f91244c3572d0dfcfaa4aa3f2ad31828557 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 13:56:35 +0100 Subject: [PATCH 12/73] fix: lint --- src/metaxy/models/feature.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index e13101cbf..cf8087b3c 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -18,7 +18,6 @@ ) from metaxy.models.plan import FeaturePlan, FQFieldKey from metaxy.models.types import FeatureKey -from metaxy.utils.code_version_descriptor import _CodeVersionDescriptor from metaxy.utils.hashing import truncate_hash if TYPE_CHECKING: From aefa0c008afb11cc7065c338d29fa2eccee0b48a Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 14:01:21 +0100 Subject: [PATCH 13/73] chore: cleanup --- src/metaxy/utils/code_version_descriptor.py | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 src/metaxy/utils/code_version_descriptor.py diff --git a/src/metaxy/utils/code_version_descriptor.py b/src/metaxy/utils/code_version_descriptor.py deleted file mode 100644 index 5ef6d57a6..000000000 --- a/src/metaxy/utils/code_version_descriptor.py +++ /dev/null @@ -1,9 +0,0 @@ -class _CodeVersionDescriptor: - """Descriptor that returns field-only code version hashes.""" - - def __get__(self, instance, owner) -> str: - if owner.spec is None: - raise ValueError( - f"Feature '{owner.__name__}' has no spec; cannot compute code_version." - ) - return owner.spec.code_version From ba3117c52eda54bce192ab7185061df34b79f9d1 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 20:29:16 +0100 Subject: [PATCH 14/73] fix: refine implementation --- src/metaxy/models/feature.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index cf8087b3c..a52fe5985 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,6 +82,17 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) +class _CodeVersionDescriptor: + """Descriptor that returns field-only code version hashes.""" + + def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: + if owner.spec is None: + raise ValueError( + f"Feature '{owner.__name__}' has no spec; cannot compute code_version." + ) + return owner.spec.field_code_version_hash + + class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From 8d5862d25c17526738d1583d0fdf9108c5e05631 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 23:04:25 +0100 Subject: [PATCH 15/73] fix: type --- docs/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/index.md b/docs/index.md index a86f345b6..8aa24d2fc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -106,6 +106,10 @@ When `Video` changes, Metaxy automatically identifies that `VoiceDetection` requ Every feature definition produces a deterministic version hash computed from its dependencies, fields, and code versions. When you modify a feature—whether changing its dependencies, adding fields, or updating transformation logic, Metaxy detects the change and propagates it downstream. This is done on multiple levels: `Feature` (class) level, field (class attribute) level, and of course on row level: each _sample_ in the metadata store tracks the version of _each field_ and the overall (class-level) feature version. +### Code vs Feature Versions + +`Feature.code_version` only looks at a feature's own fields. It hashes their keys and `code_version` values (in sorted order) and ignores the dependency graph entirely. Use it to answer _"did my feature's logic change?"_. In contrast, `Feature.feature_version()` includes both the local fields and every dependency, so it changes whenever parent features evolve. Checking both hashes lets you distinguish between local code updates and upstream changes. + This ensures that when feature definitions evolve, every feature that transitively depends on it can be systematically updated. Because Metaxy supports declaring dependencies on fields, it can identify when a feature _does not_ require recomputation, even if one of its parents has been changed (but only irrelevant fields did). This is a huge factor in improving efficiency and reducing unnecessary computations (and costs!). Because Metaxy feature graphs are static, Metaxy can calculate data version changes ahead of the actual computation. This enables patterns such as **computation preview** and **computation cost prediction**. From 6f224bf797abb4ff37cc624b45b85fbdb3865ad0 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:07:47 +0100 Subject: [PATCH 16/73] chore: cleanup --- src/metaxy/models/feature.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index a52fe5985..cf8087b3c 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,17 +82,6 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) -class _CodeVersionDescriptor: - """Descriptor that returns field-only code version hashes.""" - - def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: - if owner.spec is None: - raise ValueError( - f"Feature '{owner.__name__}' has no spec; cannot compute code_version." - ) - return owner.spec.field_code_version_hash - - class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From 41f03312133d0638f3e98b47466989db5807877f Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:12:05 +0100 Subject: [PATCH 17/73] chore: cleanup --- src/metaxy/models/feature.py | 1 + src/metaxy/utils/typing.py | 9 +++++++++ 2 files changed, 10 insertions(+) create mode 100644 src/metaxy/utils/typing.py diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index cf8087b3c..175379ec9 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -19,6 +19,7 @@ from metaxy.models.plan import FeaturePlan, FQFieldKey from metaxy.models.types import FeatureKey from metaxy.utils.hashing import truncate_hash +from metaxy.utils.typing import _CodeVersionDescriptor if TYPE_CHECKING: import narwhals as nw diff --git a/src/metaxy/utils/typing.py b/src/metaxy/utils/typing.py new file mode 100644 index 000000000..9d6fdb135 --- /dev/null +++ b/src/metaxy/utils/typing.py @@ -0,0 +1,9 @@ +class _CodeVersionDescriptor: + """Descriptor that returns field-only code version hashes.""" + + def __get__(self, instance, owner) -> str: + if owner.spec is None: + raise ValueError( + f"Feature '{owner.__name__}' has no spec; cannot compute code_version." + ) + return owner.spec.field_code_version_hash From baf3b9dc37eccfb3dda98bfeea9eaa7e4e229d82 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Wed, 29 Oct 2025 21:57:09 +0100 Subject: [PATCH 18/73] feat: #66 Add metadata parameter to FeatureSpec for user-defined information https://github.com/anam-org/metaxy/issues/66 --- CLAUDE.md | 40 ++ src/metaxy/models/feature_spec.py | 26 ++ tests/__snapshots__/test_spec_version.ambr | 1 + tests/test_feature_metadata.py | 413 +++++++++++++++++++++ 4 files changed, 480 insertions(+) create mode 100644 tests/test_feature_metadata.py diff --git a/CLAUDE.md b/CLAUDE.md index 735ecb60b..85aad3f9a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -270,6 +270,46 @@ Features can override `load_input()` for custom join logic: This is critical for migrations when upstream dependencies change. +#### User-Defined Metadata +Features can include user-defined metadata for documentation and tooling purposes: +- **metadata parameter**: Optional dict on FeatureSpec for attaching arbitrary information +- **No effect on versioning**: metadata does NOT affect `feature_version()` or `code_version()` +- **Affects spec version**: metadata IS included in `feature_spec_version` for audit trail +- **Must be JSON-serializable**: Validated at initialization +- **Use cases**: Owner, team, SLA, description, tags, custom configuration + +Example: +```python +class CustomerFeature(Feature, spec=FeatureSpec( + key=FeatureKey(["customer"]), + deps=[FeatureDep(key=FeatureKey(["user"]))], + fields=[ + FieldSpec(key=FieldKey(["age"]), code_version=1), + FieldSpec(key=FieldKey(["lifetime_value"]), code_version=1), + ], + metadata={ + "owner": "data-team", + "sla": "24h", + "description": "Customer profile enrichment", + "tags": ["customer", "profile", "enrichment"], + "pii": True, + "custom_config": { + "refresh_interval": "1h", + "alert_threshold": 0.95, + } + } +)): + pass + +# Access metadata +CustomerFeature.spec.metadata["owner"] # "data-team" + +# Metadata doesn't affect versioning (these are the same): +feature1 = Feature(spec=FeatureSpec(..., metadata={"owner": "team-a"})) +feature2 = Feature(spec=FeatureSpec(..., metadata={"owner": "team-b"})) +assert feature1.feature_version() == feature2.feature_version() +``` + ## Important Constraints ### Narwhals as the Public Interface diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 4ac81e87e..c45d0769c 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -327,6 +327,32 @@ def validate_id_columns(self) -> BaseFeatureSpec[IDColumnsT]: ) return self + @pydantic.model_validator(mode="after") + def validate_metadata_json_serializable(self) -> "FeatureSpec": + """Validate that metadata is JSON-serializable. + + This ensures that metadata can be safely serialized for storage, + transmission, and graph snapshots. + + Note: Metadata is kept as a mutable dict for Pydantic serialization compatibility, + but users should treat it as immutable. The frozen FeatureSpec model prevents + reassignment of the metadata field itself. + + Raises: + ValueError: If metadata contains non-JSON-serializable types + """ + if self.metadata is not None: + try: + # Attempt to serialize and deserialize to validate + json.dumps(self.metadata) + except (TypeError, ValueError) as e: + raise ValueError( + f"metadata must be JSON-serializable. " + f"Found non-serializable value: {e}" + ) from e + + return self + @property def feature_spec_version(self) -> str: """Compute SHA256 hash of the complete feature specification. diff --git a/tests/__snapshots__/test_spec_version.ambr b/tests/__snapshots__/test_spec_version.ambr index 013c3de5b..f899a73cc 100644 --- a/tests/__snapshots__/test_spec_version.ambr +++ b/tests/__snapshots__/test_spec_version.ambr @@ -34,6 +34,7 @@ 'snapshot', 'test', ]), + 'metadata': None, }), 'feature_spec_version': '63d427712eb566d6c066c2514693714d82df604af91b48eae453634c4d684339', 'feature_tracking_version': 'dabc7e8b54edb77992145840b2df9c81d4e384f3cdb1736bd3d5ecb83b0f422a', diff --git a/tests/test_feature_metadata.py b/tests/test_feature_metadata.py new file mode 100644 index 000000000..56b57a75d --- /dev/null +++ b/tests/test_feature_metadata.py @@ -0,0 +1,413 @@ +"""Tests for metadata parameter on FeatureSpec.""" + +import json +from typing import Any + +import pytest +from hypothesis import given +from hypothesis import strategies as st + +from metaxy import Feature, FeatureDep, FeatureKey, FeatureSpec, FieldKey, FieldSpec +from metaxy.models.feature import FeatureGraph + + +def test_metadata_basic_usage() -> None: + """Test basic metadata usage with FeatureSpec.""" + + class TestFeature( + Feature, + spec=FeatureSpec( + key=FeatureKey(["test", "metadata"]), + deps=None, + metadata={ + "owner": "data-team", + "sla": "24h", + "description": "Test feature with metadata", + }, + ), + ): + pass + + # Access metadata + assert TestFeature.spec.metadata is not None + assert TestFeature.spec.metadata["owner"] == "data-team" + assert TestFeature.spec.metadata["sla"] == "24h" + assert TestFeature.spec.metadata["description"] == "Test feature with metadata" + + +def test_metadata_does_not_affect_feature_version() -> None: + """Metadata changes should NOT change feature_version().""" + graph1 = FeatureGraph() + graph2 = FeatureGraph() + + with graph1.use(): + + class Feature1( + Feature, + spec=FeatureSpec( + key=FeatureKey(["metadata_version_test"]), # Same key! + deps=None, + fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], + metadata={"owner": "team-a"}, + ), + ): + pass + + with graph2.use(): + + class Feature2( + Feature, + spec=FeatureSpec( + key=FeatureKey(["metadata_version_test"]), # Same key! + deps=None, + fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], + metadata={"owner": "team-b"}, # Different metadata! + ), + ): + pass + + # feature_version should be the SAME (metadata doesn't affect it) + assert Feature1.feature_version() == Feature2.feature_version() + + +def test_metadata_does_not_affect_code_version() -> None: + """Metadata changes should NOT change code_version property.""" + graph1 = FeatureGraph() + graph2 = FeatureGraph() + + with graph1.use(): + + class FeatureA( + Feature, + spec=FeatureSpec( + key=FeatureKey(["metadata_code_test"]), # Same key! + deps=None, + fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], + metadata={"tag": "v1"}, + ), + ): + pass + + code_v1 = FeatureA.code_version() + + with graph2.use(): + + class FeatureB( + Feature, + spec=FeatureSpec( + key=FeatureKey(["metadata_code_test"]), # Same key! + deps=None, + fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], + metadata={"tag": "v2"}, # Different metadata! + ), + ): + pass + + code_v2 = FeatureB.code_version() + + # code_version should be the SAME (metadata doesn't affect it) + assert code_v1 == code_v2 + + +def test_metadata_affects_feature_spec_version() -> None: + """Metadata changes SHOULD change feature_spec_version (for audit trail).""" + spec1 = FeatureSpec( + key=FeatureKey(["test", "spec_version"]), + deps=None, + metadata={"owner": "team-a"}, + ) + + spec2 = FeatureSpec( + key=FeatureKey(["test", "spec_version"]), + deps=None, + metadata={"owner": "team-b"}, + ) + + # feature_spec_version should be DIFFERENT (includes metadata) + assert spec1.feature_spec_version != spec2.feature_spec_version + + +def test_metadata_json_serializable_valid() -> None: + """Test that valid JSON-serializable metadata is accepted.""" + valid_metadata = { + "string": "value", + "number": 42, + "float": 3.14, + "bool": True, + "null": None, + "list": [1, 2, 3], + "nested": {"key": "value"}, + "nested_list": [{"a": 1}, {"b": 2}], + } + + spec = FeatureSpec( + key=FeatureKey(["test", "json"]), deps=None, metadata=valid_metadata + ) + + # Should be able to serialize + assert spec.metadata is not None + serialized = json.dumps(dict(spec.metadata)) + deserialized: dict[str, Any] = json.loads(serialized) + + # Values should match + assert deserialized["string"] == "value" + assert deserialized["number"] == 42 + assert deserialized["float"] == 3.14 + assert deserialized["bool"] is True + assert deserialized["null"] is None + assert deserialized["list"] == [1, 2, 3] + assert deserialized["nested"] == {"key": "value"} + + +def test_metadata_json_serializable_invalid() -> None: + """Test that non-JSON-serializable metadata is rejected.""" + + # Lambda function is not JSON-serializable + with pytest.raises(ValueError, match="metadata must be JSON-serializable"): + FeatureSpec( + key=FeatureKey(["test", "invalid"]), + deps=None, + metadata={"func": lambda x: x}, + ) + + # Set is not JSON-serializable + with pytest.raises(ValueError, match="metadata must be JSON-serializable"): + FeatureSpec( + key=FeatureKey(["test", "invalid2"]), + deps=None, + metadata={"myset": {1, 2, 3}}, + ) + + # Custom object is not JSON-serializable + class CustomObject: + pass + + with pytest.raises(ValueError, match="metadata must be JSON-serializable"): + FeatureSpec( + key=FeatureKey(["test", "invalid3"]), + deps=None, + metadata={"obj": CustomObject()}, + ) + + +def test_metadata_not_none() -> None: + """Test that metadata dict exists when set.""" + spec = FeatureSpec( + key=FeatureKey(["test", "not_none"]), + deps=None, + metadata={"key": "value"}, + ) + + # Metadata should exist + assert spec.metadata is not None + assert "key" in spec.metadata + assert spec.metadata["key"] == "value" + + +def test_metadata_none_by_default() -> None: + """Test that metadata is None by default.""" + spec = FeatureSpec(key=FeatureKey(["test", "default"]), deps=None) + + assert spec.metadata is None + + +def test_metadata_empty_dict() -> None: + """Test that empty dict metadata is accepted.""" + spec = FeatureSpec(key=FeatureKey(["test", "empty"]), deps=None, metadata={}) + + # Should be empty but not None + assert spec.metadata is not None + assert len(spec.metadata) == 0 + + +def test_metadata_complex_structure() -> None: + """Test metadata with complex nested structures.""" + complex_metadata = { + "owner": "data-team", + "sla": "24h", + "tags": ["customer", "profile", "enrichment"], + "cost_tier": "high", + "pii": True, + "custom_config": { + "refresh_interval": "1h", + "alert_threshold": 0.95, + "retry_policy": { + "max_retries": 3, + "backoff_multiplier": 2, + }, + }, + "contacts": [ + {"name": "Alice", "email": "alice@example.com"}, + {"name": "Bob", "email": "bob@example.com"}, + ], + } + + spec = FeatureSpec( + key=FeatureKey(["test", "complex"]), deps=None, metadata=complex_metadata + ) + + # Should be accessible + assert spec.metadata is not None + assert spec.metadata["owner"] == "data-team" + assert spec.metadata["tags"] == ["customer", "profile", "enrichment"] + assert spec.metadata["custom_config"]["refresh_interval"] == "1h" # type: ignore[index] + assert spec.metadata["custom_config"]["retry_policy"]["max_retries"] == 3 # type: ignore[index] + assert spec.metadata["contacts"][0]["name"] == "Alice" # type: ignore[index] + + +def test_metadata_with_feature_usage_example() -> None: + """Test real-world usage example from documentation.""" + + class ParentFeature( + Feature, + spec=FeatureSpec( + key=FeatureKey(["example", "parent"]), + deps=None, + fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], + ), + ): + pass + + class CustomerFeature( + Feature, + spec=FeatureSpec( + key=FeatureKey(["example", "customer"]), + deps=[FeatureDep(key=FeatureKey(["example", "parent"]))], + fields=[ + FieldSpec(key=FieldKey(["age"]), code_version=1), + FieldSpec(key=FieldKey(["lifetime_value"]), code_version=1), + ], + metadata={ + "owner": "data-team", + "sla": "24h", + "description": "Customer profile enrichment", + "tags": ["customer", "profile", "enrichment"], + "cost_tier": "high", + "pii": True, + "custom_config": { + "refresh_interval": "1h", + "alert_threshold": 0.95, + }, + }, + ), + ): + pass + + # Access metadata + assert CustomerFeature.spec.metadata is not None + assert CustomerFeature.spec.metadata["owner"] == "data-team" + assert CustomerFeature.spec.metadata["pii"] is True + assert CustomerFeature.spec.metadata["custom_config"]["refresh_interval"] == "1h" # type: ignore[index] + + +def test_metadata_serialization_in_model_dump() -> None: + """Test that metadata is included in model_dump().""" + spec = FeatureSpec( + key=FeatureKey(["test", "dump"]), + deps=None, + metadata={"owner": "team-a"}, + ) + + dumped = spec.model_dump(mode="json") + + # Metadata should be in the dump + assert "metadata" in dumped + assert dumped["metadata"]["owner"] == "team-a" # type: ignore[index] + + +# Property-based tests using Hypothesis + + +@given( + owner=st.text(min_size=1, max_size=50), + sla=st.text(min_size=1, max_size=20), +) +def test_property_metadata_does_not_affect_feature_version( + owner: str, sla: str +) -> None: + """Property test: different metadata values don't change feature_version.""" + graph1 = FeatureGraph() + graph2 = FeatureGraph() + + with graph1.use(): + + class Feature1( + Feature, + spec=FeatureSpec( + key=FeatureKey(["property", "test"]), # Same key! + deps=None, + fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], + metadata={"owner": owner, "sla": sla}, + ), + ): + pass + + v1 = Feature1.feature_version() + + with graph2.use(): + + class Feature2( + Feature, + spec=FeatureSpec( + key=FeatureKey(["property", "test"]), # Same key! + deps=None, + fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], + metadata={"owner": "different", "sla": "different"}, + ), + ): + pass + + v2 = Feature2.feature_version() + + # feature_version should be the same regardless of metadata + assert v1 == v2 + + +@given( + metadata_dict=st.dictionaries( + keys=st.text(min_size=1, max_size=20), + values=st.one_of( + st.text(max_size=100), + st.integers(), + st.floats(allow_nan=False, allow_infinity=False), + st.booleans(), + st.none(), + ), + min_size=1, + max_size=10, + ) +) +def test_property_metadata_json_serializable(metadata_dict: dict[str, Any]) -> None: + """Property test: randomly generated metadata should be JSON-serializable.""" + spec = FeatureSpec( + key=FeatureKey(["property", "json"]), deps=None, metadata=metadata_dict + ) + + # Should be able to serialize and deserialize + assert spec.metadata is not None + serialized = json.dumps(dict(spec.metadata)) + deserialized: dict[str, Any] = json.loads(serialized) + + # Check a sample key if present + if metadata_dict: + sample_key = list(metadata_dict.keys())[0] + assert sample_key in deserialized + + +@given( + num_keys=st.integers(min_value=1, max_value=20), +) +def test_property_metadata_access(num_keys: int) -> None: + """Property test: metadata should be accessible with various numbers of keys.""" + metadata = {f"key_{i}": f"value_{i}" for i in range(num_keys)} + + spec = FeatureSpec( + key=FeatureKey(["property", "access"]), + deps=None, + metadata=metadata, + ) + + # Should be able to access all keys + assert spec.metadata is not None + for i in range(num_keys): + assert spec.metadata[f"key_{i}"] == f"value_{i}" From 724e1faea0f1d52db968bd9ef9b9649a271720d1 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Wed, 29 Oct 2025 22:15:18 +0100 Subject: [PATCH 19/73] fix: refine snapshots again with pre-commit hook --- tests/cli/__snapshots__/test_cli_graph.ambr | 8 ++++---- tests/examples/__snapshots__/test_recompute.ambr | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/cli/__snapshots__/test_cli_graph.ambr b/tests/cli/__snapshots__/test_cli_graph.ambr index c925e2837..e60e527f5 100644 --- a/tests/cli/__snapshots__/test_cli_graph.ambr +++ b/tests/cli/__snapshots__/test_cli_graph.ambr @@ -7,17 +7,17 @@ labelloc=t; fontsize=14; fontname=helvetica; - + "examples/parent" ; "examples/child" ; - + "examples/parent" -> "examples/child"; - + "examples/parent::embeddings" ; "examples/parent" -> "examples/parent::embeddings" ; "examples/child::predictions" ; "examples/child" -> "examples/child::predictions" ; } - + ''' # --- diff --git a/tests/examples/__snapshots__/test_recompute.ambr b/tests/examples/__snapshots__/test_recompute.ambr index 13f6cf1c9..580ff420b 100644 --- a/tests/examples/__snapshots__/test_recompute.ambr +++ b/tests/examples/__snapshots__/test_recompute.ambr @@ -15,7 +15,7 @@ Fields: predictions (code_version 1, version 1905d9e85ceb58b0361e863c7fbfbf5843582f8ed135ea5ce6c24ee2c68c6bb4) - + ''' # --- # name: test_list_features[2] @@ -34,6 +34,6 @@ Fields: predictions (code_version 1, version bcb950543ea50ba3e19aef4846cbc3d36725e2548040c4327219eb2e75e6d997) - + ''' # --- From 6adb1c6cdd8f94ebd04cda1e3ea3582c3f867838 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Wed, 29 Oct 2025 22:27:51 +0100 Subject: [PATCH 20/73] fix: snapshot newlines --- tests/cli/__snapshots__/test_cli_graph.ambr | 8 ++++---- .../__snapshots__/test_recompute.ambr | 20 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/cli/__snapshots__/test_cli_graph.ambr b/tests/cli/__snapshots__/test_cli_graph.ambr index e60e527f5..c925e2837 100644 --- a/tests/cli/__snapshots__/test_cli_graph.ambr +++ b/tests/cli/__snapshots__/test_cli_graph.ambr @@ -7,17 +7,17 @@ labelloc=t; fontsize=14; fontname=helvetica; - + "examples/parent" ; "examples/child" ; - + "examples/parent" -> "examples/child"; - + "examples/parent::embeddings" ; "examples/parent" -> "examples/parent::embeddings" ; "examples/child::predictions" ; "examples/child" -> "examples/child::predictions" ; } - + ''' # --- diff --git a/tests/examples/__snapshots__/test_recompute.ambr b/tests/examples/__snapshots__/test_recompute.ambr index 580ff420b..0e4bdc1a3 100644 --- a/tests/examples/__snapshots__/test_recompute.ambr +++ b/tests/examples/__snapshots__/test_recompute.ambr @@ -2,38 +2,38 @@ # name: test_list_features[1] ''' --- - examples/parent (version + examples/parent (version 0aad9b8a2ea055cde3c4149fc4cde576e6478a982cea75b45c3cd012db43a5e8) Fields: - embeddings (code_version 1, version + embeddings (code_version 1, version 05e66510da58ef37168095b60188849cd6b1f0a4b539d0ac29ffd1e15b756459) --- - examples/child (version + examples/child (version 440ffb028aaa5cb21b155c4ef21debd81f283f99aa91ef58cbe541d71164b44f) Feature Dependencies: feature=examples/parent columns=None rename=None Fields: - predictions (code_version 1, version + predictions (code_version 1, version 1905d9e85ceb58b0361e863c7fbfbf5843582f8ed135ea5ce6c24ee2c68c6bb4) - + ''' # --- # name: test_list_features[2] ''' --- - examples/parent (version + examples/parent (version a007f308d0a852e3fbf80a442bd0089e29eed94efefe85231ef4fc927aa7d737) Fields: - embeddings (code_version 2, version + embeddings (code_version 2, version 3c8d3e9ba031ab3613eb4db0877d3959fce76d94d625335b89bae7fbd4f27add) --- - examples/child (version + examples/child (version 7251e21c32d2d8e35a8ba389a8ce1b597663f206dee0ff55e542a3af1f1665cd) Feature Dependencies: feature=examples/parent columns=None rename=None Fields: - predictions (code_version 1, version + predictions (code_version 1, version bcb950543ea50ba3e19aef4846cbc3d36725e2548040c4327219eb2e75e6d997) - + ''' # --- From 1280ef0aeb0f765da61920f532595eab80f20a47 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 06:48:40 +0100 Subject: [PATCH 21/73] fix: claude --- CLAUDE.md | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 85aad3f9a..592f889da 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -304,10 +304,24 @@ class CustomerFeature(Feature, spec=FeatureSpec( # Access metadata CustomerFeature.spec.metadata["owner"] # "data-team" -# Metadata doesn't affect versioning (these are the same): -feature1 = Feature(spec=FeatureSpec(..., metadata={"owner": "team-a"})) -feature2 = Feature(spec=FeatureSpec(..., metadata={"owner": "team-b"})) -assert feature1.feature_version() == feature2.feature_version() +# Metadata doesn't affect versioning (these produce the same feature_version): +class Feature1(Feature, spec=FeatureSpec( + key=FeatureKey(["example"]), + deps=[], + fields=[FieldSpec(key=FieldKey(["value"]), code_version=1)], + metadata={"owner": "team-a"} +)): + pass + +class Feature2(Feature, spec=FeatureSpec( + key=FeatureKey(["example"]), + deps=[], + fields=[FieldSpec(key=FieldKey(["value"]), code_version=1)], + metadata={"owner": "team-b"} +)): + pass + +assert Feature1.feature_version() == Feature2.feature_version() ``` ## Important Constraints From 8a68d6c8928b724e82e0ecc125f2a095d74e1e08 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 06:54:22 +0100 Subject: [PATCH 22/73] fix: upgrade CI --- .github/workflows/QA.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 831d4946f..fb86cbbd7 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -37,7 +37,7 @@ jobs: runs-on: depot-ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install uv uses: astral-sh/setup-uv@v7 with: @@ -49,7 +49,7 @@ jobs: runs-on: depot-ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install uv uses: astral-sh/setup-uv@v7 with: From 0e3d6e71f5b6703ac50f4c4e9562a3f5d1f8e289 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 21:41:26 +0100 Subject: [PATCH 23/73] fix: refine impl --- CLAUDE.md | 78 +++--- docs/learn/feature-definitions.md | 2 + pyproject.toml | 1 + src/metaxy/models/feature_spec.py | 18 ++ tests/test_feature_metadata.py | 390 +++--------------------------- uv.lock | 21 ++ 6 files changed, 120 insertions(+), 390 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 592f889da..377f26e91 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -271,7 +271,9 @@ Features can override `load_input()` for custom join logic: This is critical for migrations when upstream dependencies change. #### User-Defined Metadata + Features can include user-defined metadata for documentation and tooling purposes: + - **metadata parameter**: Optional dict on FeatureSpec for attaching arbitrary information - **No effect on versioning**: metadata does NOT affect `feature_version()` or `code_version()` - **Affects spec version**: metadata IS included in `feature_spec_version` for audit trail @@ -279,48 +281,62 @@ Features can include user-defined metadata for documentation and tooling purpose - **Use cases**: Owner, team, SLA, description, tags, custom configuration Example: + ```python -class CustomerFeature(Feature, spec=FeatureSpec( - key=FeatureKey(["customer"]), - deps=[FeatureDep(key=FeatureKey(["user"]))], - fields=[ - FieldSpec(key=FieldKey(["age"]), code_version=1), - FieldSpec(key=FieldKey(["lifetime_value"]), code_version=1), - ], - metadata={ - "owner": "data-team", - "sla": "24h", - "description": "Customer profile enrichment", - "tags": ["customer", "profile", "enrichment"], - "pii": True, - "custom_config": { - "refresh_interval": "1h", - "alert_threshold": 0.95, - } - } -)): +class CustomerFeature( + Feature, + spec=FeatureSpec( + key=FeatureKey(["customer"]), + deps=[FeatureDep(key=FeatureKey(["user"]))], + fields=[ + FieldSpec(key=FieldKey(["age"]), code_version=1), + FieldSpec(key=FieldKey(["lifetime_value"]), code_version=1), + ], + metadata={ + "owner": "data-team", + "sla": "24h", + "description": "Customer profile enrichment", + "tags": ["customer", "profile", "enrichment"], + "pii": True, + "custom_config": { + "refresh_interval": "1h", + "alert_threshold": 0.95, + }, + }, + ), +): pass + # Access metadata CustomerFeature.spec.metadata["owner"] # "data-team" + # Metadata doesn't affect versioning (these produce the same feature_version): -class Feature1(Feature, spec=FeatureSpec( - key=FeatureKey(["example"]), - deps=[], - fields=[FieldSpec(key=FieldKey(["value"]), code_version=1)], - metadata={"owner": "team-a"} -)): +class Feature1( + Feature, + spec=FeatureSpec( + key=FeatureKey(["example"]), + deps=[], + fields=[FieldSpec(key=FieldKey(["value"]), code_version=1)], + metadata={"owner": "team-a"}, + ), +): pass -class Feature2(Feature, spec=FeatureSpec( - key=FeatureKey(["example"]), - deps=[], - fields=[FieldSpec(key=FieldKey(["value"]), code_version=1)], - metadata={"owner": "team-b"} -)): + +class Feature2( + Feature, + spec=FeatureSpec( + key=FeatureKey(["example"]), + deps=[], + fields=[FieldSpec(key=FieldKey(["value"]), code_version=1)], + metadata={"owner": "team-b"}, + ), +): pass + assert Feature1.feature_version() == Feature2.feature_version() ``` diff --git a/docs/learn/feature-definitions.md b/docs/learn/feature-definitions.md index cdbab7ba9..3901ff5a7 100644 --- a/docs/learn/feature-definitions.md +++ b/docs/learn/feature-definitions.md @@ -36,6 +36,8 @@ class VideoFeatureSpec(BaseFeatureSpec[VideoIds]): `BaseFeatureSpec` is a [Pydantic](https://docs.pydantic.dev/latest/) model, so all normal Pydantic features apply. +Feature specs now support an optional `metadata` dictionary for attaching ownership, documentation, or tooling context to a feature. This metadata **never** influences graph topology or version hashes, must be JSON-serializable, and is stored as an immutable [`frozendict`](https://pypi.org/project/frozendict/) once the spec is created (list values are frozen as tuples to guarantee immutability). It is ideal for values such as owners, SLAs, runbooks, or tags that external systems may want to inspect. + With our `VideoFeatureSpec` in place, we can proceed to defining features that would be using it. ## Feature Definitions diff --git a/pyproject.toml b/pyproject.toml index 7161c27f8..53b6623db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ authors = [ requires-python = ">=3.10" dependencies = [ "cyclopts==4.0.0b1", + "frozendict>=2.4.4", "narwhals>=2.9.0", "polars>=1.33.1", "polars-hash>=0.5.1", diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index c45d0769c..2873f52ef 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -18,6 +18,7 @@ ) import pydantic +from frozendict import frozendict from pydantic import BeforeValidator from typing_extensions import Self @@ -197,6 +198,19 @@ def table_name(self) -> str: """Get SQL-like table name for this feature spec.""" return self.feature.table_name + +def _freeze_metadata(value: Any) -> Any: + """Recursively freeze metadata containers to enforce immutability.""" + if isinstance(value, frozendict): + return value + if isinstance(value, Mapping): + return frozendict({k: _freeze_metadata(v) for k, v in value.items()}) + if isinstance(value, list): + return tuple(_freeze_metadata(v) for v in value) + if isinstance(value, tuple): + return tuple(_freeze_metadata(v) for v in value) + return value + IDColumns: TypeAlias = Sequence[ str @@ -231,6 +245,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, + metadata: dict[str, Any] | None = None, ) -> None: """Initialize from string key.""" ... @@ -243,6 +258,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, + metadata: dict[str, Any] | None = None, ) -> None: """Initialize from sequence of parts.""" ... @@ -255,6 +271,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, + metadata: dict[str, Any] | None = None, ) -> None: """Initialize from FeatureKey instance.""" ... @@ -350,6 +367,7 @@ def validate_metadata_json_serializable(self) -> "FeatureSpec": f"metadata must be JSON-serializable. " f"Found non-serializable value: {e}" ) from e + object.__setattr__(self, "metadata", _freeze_metadata(self.metadata)) return self diff --git a/tests/test_feature_metadata.py b/tests/test_feature_metadata.py index 56b57a75d..4c880a9ab 100644 --- a/tests/test_feature_metadata.py +++ b/tests/test_feature_metadata.py @@ -1,134 +1,49 @@ -"""Tests for metadata parameter on FeatureSpec.""" - import json -from typing import Any import pytest -from hypothesis import given -from hypothesis import strategies as st +from frozendict import frozendict -from metaxy import Feature, FeatureDep, FeatureKey, FeatureSpec, FieldKey, FieldSpec +from metaxy import Feature, FeatureKey, FeatureSpec from metaxy.models.feature import FeatureGraph -def test_metadata_basic_usage() -> None: - """Test basic metadata usage with FeatureSpec.""" - - class TestFeature( - Feature, - spec=FeatureSpec( - key=FeatureKey(["test", "metadata"]), - deps=None, - metadata={ - "owner": "data-team", - "sla": "24h", - "description": "Test feature with metadata", - }, - ), - ): - pass - - # Access metadata - assert TestFeature.spec.metadata is not None - assert TestFeature.spec.metadata["owner"] == "data-team" - assert TestFeature.spec.metadata["sla"] == "24h" - assert TestFeature.spec.metadata["description"] == "Test feature with metadata" - - -def test_metadata_does_not_affect_feature_version() -> None: - """Metadata changes should NOT change feature_version().""" - graph1 = FeatureGraph() - graph2 = FeatureGraph() +def test_metadata_does_not_affect_version() -> None: + """Metadata differences should not change feature version hashes.""" + graph_a = FeatureGraph() + with graph_a.use(): - with graph1.use(): - - class Feature1( + class MetadataFeatureA( Feature, spec=FeatureSpec( - key=FeatureKey(["metadata_version_test"]), # Same key! + key=FeatureKey(["tests", "metadata", "same"]), deps=None, - fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], metadata={"owner": "team-a"}, ), ): pass - with graph2.use(): - - class Feature2( - Feature, - spec=FeatureSpec( - key=FeatureKey(["metadata_version_test"]), # Same key! - deps=None, - fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], - metadata={"owner": "team-b"}, # Different metadata! - ), - ): - pass - - # feature_version should be the SAME (metadata doesn't affect it) - assert Feature1.feature_version() == Feature2.feature_version() + version_a = MetadataFeatureA.feature_version() + graph_b = FeatureGraph() + with graph_b.use(): -def test_metadata_does_not_affect_code_version() -> None: - """Metadata changes should NOT change code_version property.""" - graph1 = FeatureGraph() - graph2 = FeatureGraph() - - with graph1.use(): - - class FeatureA( - Feature, - spec=FeatureSpec( - key=FeatureKey(["metadata_code_test"]), # Same key! - deps=None, - fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], - metadata={"tag": "v1"}, - ), - ): - pass - - code_v1 = FeatureA.code_version() - - with graph2.use(): - - class FeatureB( + class MetadataFeatureB( Feature, spec=FeatureSpec( - key=FeatureKey(["metadata_code_test"]), # Same key! + key=FeatureKey(["tests", "metadata", "same"]), deps=None, - fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], - metadata={"tag": "v2"}, # Different metadata! + metadata={"owner": "team-b"}, ), ): pass - code_v2 = FeatureB.code_version() - - # code_version should be the SAME (metadata doesn't affect it) - assert code_v1 == code_v2 - - -def test_metadata_affects_feature_spec_version() -> None: - """Metadata changes SHOULD change feature_spec_version (for audit trail).""" - spec1 = FeatureSpec( - key=FeatureKey(["test", "spec_version"]), - deps=None, - metadata={"owner": "team-a"}, - ) - - spec2 = FeatureSpec( - key=FeatureKey(["test", "spec_version"]), - deps=None, - metadata={"owner": "team-b"}, - ) + version_b = MetadataFeatureB.feature_version() - # feature_spec_version should be DIFFERENT (includes metadata) - assert spec1.feature_spec_version != spec2.feature_spec_version + assert version_a == version_b -def test_metadata_json_serializable_valid() -> None: - """Test that valid JSON-serializable metadata is accepted.""" +def test_metadata_json_serializable() -> None: + """Validate JSON serialization enforcement for metadata.""" valid_metadata = { "string": "value", "number": 42, @@ -137,277 +52,34 @@ def test_metadata_json_serializable_valid() -> None: "null": None, "list": [1, 2, 3], "nested": {"key": "value"}, - "nested_list": [{"a": 1}, {"b": 2}], } spec = FeatureSpec( - key=FeatureKey(["test", "json"]), deps=None, metadata=valid_metadata + key=FeatureKey(["tests", "metadata", "json"]), + deps=None, + metadata=valid_metadata, ) - - # Should be able to serialize assert spec.metadata is not None - serialized = json.dumps(dict(spec.metadata)) - deserialized: dict[str, Any] = json.loads(serialized) - - # Values should match - assert deserialized["string"] == "value" - assert deserialized["number"] == 42 - assert deserialized["float"] == 3.14 - assert deserialized["bool"] is True - assert deserialized["null"] is None - assert deserialized["list"] == [1, 2, 3] - assert deserialized["nested"] == {"key": "value"} + assert isinstance(spec.metadata, frozendict) + assert isinstance(spec.metadata["list"], tuple) + assert json.dumps(dict(spec.metadata)) is not None - -def test_metadata_json_serializable_invalid() -> None: - """Test that non-JSON-serializable metadata is rejected.""" - - # Lambda function is not JSON-serializable - with pytest.raises(ValueError, match="metadata must be JSON-serializable"): + with pytest.raises(ValueError): FeatureSpec( - key=FeatureKey(["test", "invalid"]), + key=FeatureKey(["tests", "metadata", "json"]), deps=None, metadata={"func": lambda x: x}, ) - # Set is not JSON-serializable - with pytest.raises(ValueError, match="metadata must be JSON-serializable"): - FeatureSpec( - key=FeatureKey(["test", "invalid2"]), - deps=None, - metadata={"myset": {1, 2, 3}}, - ) - - # Custom object is not JSON-serializable - class CustomObject: - pass - with pytest.raises(ValueError, match="metadata must be JSON-serializable"): - FeatureSpec( - key=FeatureKey(["test", "invalid3"]), - deps=None, - metadata={"obj": CustomObject()}, - ) - - -def test_metadata_not_none() -> None: - """Test that metadata dict exists when set.""" +def test_metadata_immutable() -> None: + """Metadata mapping should be immutable after initialization.""" spec = FeatureSpec( - key=FeatureKey(["test", "not_none"]), + key=FeatureKey(["tests", "metadata", "immutable"]), deps=None, metadata={"key": "value"}, ) - - # Metadata should exist - assert spec.metadata is not None - assert "key" in spec.metadata - assert spec.metadata["key"] == "value" - - -def test_metadata_none_by_default() -> None: - """Test that metadata is None by default.""" - spec = FeatureSpec(key=FeatureKey(["test", "default"]), deps=None) - - assert spec.metadata is None - - -def test_metadata_empty_dict() -> None: - """Test that empty dict metadata is accepted.""" - spec = FeatureSpec(key=FeatureKey(["test", "empty"]), deps=None, metadata={}) - - # Should be empty but not None assert spec.metadata is not None - assert len(spec.metadata) == 0 - -def test_metadata_complex_structure() -> None: - """Test metadata with complex nested structures.""" - complex_metadata = { - "owner": "data-team", - "sla": "24h", - "tags": ["customer", "profile", "enrichment"], - "cost_tier": "high", - "pii": True, - "custom_config": { - "refresh_interval": "1h", - "alert_threshold": 0.95, - "retry_policy": { - "max_retries": 3, - "backoff_multiplier": 2, - }, - }, - "contacts": [ - {"name": "Alice", "email": "alice@example.com"}, - {"name": "Bob", "email": "bob@example.com"}, - ], - } - - spec = FeatureSpec( - key=FeatureKey(["test", "complex"]), deps=None, metadata=complex_metadata - ) - - # Should be accessible - assert spec.metadata is not None - assert spec.metadata["owner"] == "data-team" - assert spec.metadata["tags"] == ["customer", "profile", "enrichment"] - assert spec.metadata["custom_config"]["refresh_interval"] == "1h" # type: ignore[index] - assert spec.metadata["custom_config"]["retry_policy"]["max_retries"] == 3 # type: ignore[index] - assert spec.metadata["contacts"][0]["name"] == "Alice" # type: ignore[index] - - -def test_metadata_with_feature_usage_example() -> None: - """Test real-world usage example from documentation.""" - - class ParentFeature( - Feature, - spec=FeatureSpec( - key=FeatureKey(["example", "parent"]), - deps=None, - fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], - ), - ): - pass - - class CustomerFeature( - Feature, - spec=FeatureSpec( - key=FeatureKey(["example", "customer"]), - deps=[FeatureDep(key=FeatureKey(["example", "parent"]))], - fields=[ - FieldSpec(key=FieldKey(["age"]), code_version=1), - FieldSpec(key=FieldKey(["lifetime_value"]), code_version=1), - ], - metadata={ - "owner": "data-team", - "sla": "24h", - "description": "Customer profile enrichment", - "tags": ["customer", "profile", "enrichment"], - "cost_tier": "high", - "pii": True, - "custom_config": { - "refresh_interval": "1h", - "alert_threshold": 0.95, - }, - }, - ), - ): - pass - - # Access metadata - assert CustomerFeature.spec.metadata is not None - assert CustomerFeature.spec.metadata["owner"] == "data-team" - assert CustomerFeature.spec.metadata["pii"] is True - assert CustomerFeature.spec.metadata["custom_config"]["refresh_interval"] == "1h" # type: ignore[index] - - -def test_metadata_serialization_in_model_dump() -> None: - """Test that metadata is included in model_dump().""" - spec = FeatureSpec( - key=FeatureKey(["test", "dump"]), - deps=None, - metadata={"owner": "team-a"}, - ) - - dumped = spec.model_dump(mode="json") - - # Metadata should be in the dump - assert "metadata" in dumped - assert dumped["metadata"]["owner"] == "team-a" # type: ignore[index] - - -# Property-based tests using Hypothesis - - -@given( - owner=st.text(min_size=1, max_size=50), - sla=st.text(min_size=1, max_size=20), -) -def test_property_metadata_does_not_affect_feature_version( - owner: str, sla: str -) -> None: - """Property test: different metadata values don't change feature_version.""" - graph1 = FeatureGraph() - graph2 = FeatureGraph() - - with graph1.use(): - - class Feature1( - Feature, - spec=FeatureSpec( - key=FeatureKey(["property", "test"]), # Same key! - deps=None, - fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], - metadata={"owner": owner, "sla": sla}, - ), - ): - pass - - v1 = Feature1.feature_version() - - with graph2.use(): - - class Feature2( - Feature, - spec=FeatureSpec( - key=FeatureKey(["property", "test"]), # Same key! - deps=None, - fields=[FieldSpec(key=FieldKey(["default"]), code_version=1)], - metadata={"owner": "different", "sla": "different"}, - ), - ): - pass - - v2 = Feature2.feature_version() - - # feature_version should be the same regardless of metadata - assert v1 == v2 - - -@given( - metadata_dict=st.dictionaries( - keys=st.text(min_size=1, max_size=20), - values=st.one_of( - st.text(max_size=100), - st.integers(), - st.floats(allow_nan=False, allow_infinity=False), - st.booleans(), - st.none(), - ), - min_size=1, - max_size=10, - ) -) -def test_property_metadata_json_serializable(metadata_dict: dict[str, Any]) -> None: - """Property test: randomly generated metadata should be JSON-serializable.""" - spec = FeatureSpec( - key=FeatureKey(["property", "json"]), deps=None, metadata=metadata_dict - ) - - # Should be able to serialize and deserialize - assert spec.metadata is not None - serialized = json.dumps(dict(spec.metadata)) - deserialized: dict[str, Any] = json.loads(serialized) - - # Check a sample key if present - if metadata_dict: - sample_key = list(metadata_dict.keys())[0] - assert sample_key in deserialized - - -@given( - num_keys=st.integers(min_value=1, max_value=20), -) -def test_property_metadata_access(num_keys: int) -> None: - """Property test: metadata should be accessible with various numbers of keys.""" - metadata = {f"key_{i}": f"value_{i}" for i in range(num_keys)} - - spec = FeatureSpec( - key=FeatureKey(["property", "access"]), - deps=None, - metadata=metadata, - ) - - # Should be able to access all keys - assert spec.metadata is not None - for i in range(num_keys): - assert spec.metadata[f"key_{i}"] == f"value_{i}" + with pytest.raises(TypeError): + spec.metadata["key"] = "new_value" # type: ignore[index] diff --git a/uv.lock b/uv.lock index c46262fbd..26f464e48 100644 --- a/uv.lock +++ b/uv.lock @@ -518,6 +518,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, ] +[[package]] +name = "frozendict" +version = "2.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/59/19eb300ba28e7547538bdf603f1c6c34793240a90e1a7b61b65d8517e35e/frozendict-2.4.6.tar.gz", hash = "sha256:df7cd16470fbd26fc4969a208efadc46319334eb97def1ddf48919b351192b8e", size = 316416, upload-time = "2024-10-13T12:15:32.449Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/7f/e80cdbe0db930b2ba9d46ca35a41b0150156da16dfb79edcc05642690c3b/frozendict-2.4.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c3a05c0a50cab96b4bb0ea25aa752efbfceed5ccb24c007612bc63e51299336f", size = 37927, upload-time = "2024-10-13T12:14:17.927Z" }, + { url = "https://files.pythonhosted.org/packages/29/98/27e145ff7e8e63caa95fb8ee4fc56c68acb208bef01a89c3678a66f9a34d/frozendict-2.4.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f5b94d5b07c00986f9e37a38dd83c13f5fe3bf3f1ccc8e88edea8fe15d6cd88c", size = 37945, upload-time = "2024-10-13T12:14:19.976Z" }, + { url = "https://files.pythonhosted.org/packages/ac/f1/a10be024a9d53441c997b3661ea80ecba6e3130adc53812a4b95b607cdd1/frozendict-2.4.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4c789fd70879ccb6289a603cdebdc4953e7e5dea047d30c1b180529b28257b5", size = 117656, upload-time = "2024-10-13T12:14:22.038Z" }, + { url = "https://files.pythonhosted.org/packages/46/a6/34c760975e6f1cb4db59a990d58dcf22287e10241c851804670c74c6a27a/frozendict-2.4.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da6a10164c8a50b34b9ab508a9420df38f4edf286b9ca7b7df8a91767baecb34", size = 117444, upload-time = "2024-10-13T12:14:24.251Z" }, + { url = "https://files.pythonhosted.org/packages/62/dd/64bddd1ffa9617f50e7e63656b2a7ad7f0a46c86b5f4a3d2c714d0006277/frozendict-2.4.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9a8a43036754a941601635ea9c788ebd7a7efbed2becba01b54a887b41b175b9", size = 116801, upload-time = "2024-10-13T12:14:26.518Z" }, + { url = "https://files.pythonhosted.org/packages/45/ae/af06a8bde1947277aad895c2f26c3b8b8b6ee9c0c2ad988fb58a9d1dde3f/frozendict-2.4.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c9905dcf7aa659e6a11b8051114c9fa76dfde3a6e50e6dc129d5aece75b449a2", size = 117329, upload-time = "2024-10-13T12:14:28.485Z" }, + { url = "https://files.pythonhosted.org/packages/d2/df/be3fa0457ff661301228f4c59c630699568c8ed9b5480f113b3eea7d0cb3/frozendict-2.4.6-cp310-cp310-win_amd64.whl", hash = "sha256:323f1b674a2cc18f86ab81698e22aba8145d7a755e0ac2cccf142ee2db58620d", size = 37522, upload-time = "2024-10-13T12:14:30.418Z" }, + { url = "https://files.pythonhosted.org/packages/4a/6f/c22e0266b4c85f58b4613fec024e040e93753880527bf92b0c1bc228c27c/frozendict-2.4.6-cp310-cp310-win_arm64.whl", hash = "sha256:eabd21d8e5db0c58b60d26b4bb9839cac13132e88277e1376970172a85ee04b3", size = 34056, upload-time = "2024-10-13T12:14:31.757Z" }, + { url = "https://files.pythonhosted.org/packages/04/13/d9839089b900fa7b479cce495d62110cddc4bd5630a04d8469916c0e79c5/frozendict-2.4.6-py311-none-any.whl", hash = "sha256:d065db6a44db2e2375c23eac816f1a022feb2fa98cbb50df44a9e83700accbea", size = 16148, upload-time = "2024-10-13T12:15:26.839Z" }, + { url = "https://files.pythonhosted.org/packages/ba/d0/d482c39cee2ab2978a892558cf130681d4574ea208e162da8958b31e9250/frozendict-2.4.6-py312-none-any.whl", hash = "sha256:49344abe90fb75f0f9fdefe6d4ef6d4894e640fadab71f11009d52ad97f370b9", size = 16146, upload-time = "2024-10-13T12:15:28.16Z" }, + { url = "https://files.pythonhosted.org/packages/a5/8e/b6bf6a0de482d7d7d7a2aaac8fdc4a4d0bb24a809f5ddd422aa7060eb3d2/frozendict-2.4.6-py313-none-any.whl", hash = "sha256:7134a2bb95d4a16556bb5f2b9736dceb6ea848fa5b6f3f6c2d6dba93b44b4757", size = 16146, upload-time = "2024-10-13T12:15:29.495Z" }, +] + [[package]] name = "ghp-import" version = "2.1.0" @@ -999,6 +1018,7 @@ version = "0.0.0" source = { editable = "." } dependencies = [ { name = "cyclopts" }, + { name = "frozendict" }, { name = "narwhals" }, { name = "polars" }, { name = "polars-hash" }, @@ -1064,6 +1084,7 @@ docs = [ [package.metadata] requires-dist = [ { name = "cyclopts", git = "https://github.com/BrianPugh/cyclopts.git?branch=mkdocs-plugin" }, + { name = "frozendict", specifier = ">=2.4.4" }, { name = "ibis-framework", marker = "extra == 'ibis'", specifier = ">=11.0.0" }, { name = "mermaid-py", marker = "extra == 'mermaid'", specifier = ">=0.8.0" }, { name = "narwhals", specifier = ">=2.9.0" }, From 1bfb6feb8505247ee73bc989cde0d2fce91bafe6 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:00:18 +0100 Subject: [PATCH 24/73] fix: lint --- src/metaxy/models/feature_spec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 2873f52ef..c53e28209 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -345,7 +345,7 @@ def validate_id_columns(self) -> BaseFeatureSpec[IDColumnsT]: return self @pydantic.model_validator(mode="after") - def validate_metadata_json_serializable(self) -> "FeatureSpec": + def validate_metadata_json_serializable(self) -> FeatureSpec: """Validate that metadata is JSON-serializable. This ensures that metadata can be safely serialized for storage, From 369b0ddb9b97f3b508f2efe5663f3e23ebdcc2ca Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:24:50 +0100 Subject: [PATCH 25/73] chore: savepoint --- .../test_feature_project_detection.ambr | 2 +- .../test_feature_tracking_version.ambr | 6 +++--- tests/__snapshots__/test_id_columns.ambr | 6 +++--- tests/__snapshots__/test_spec_version.ambr | 19 +++++++++++++++---- .../__snapshots__/test_snapshot_push.ambr | 10 +++++++++- 5 files changed, 31 insertions(+), 12 deletions(-) diff --git a/tests/__snapshots__/test_feature_project_detection.ambr b/tests/__snapshots__/test_feature_project_detection.ambr index 1cbe30459..1986f6fef 100644 --- a/tests/__snapshots__/test_feature_project_detection.ambr +++ b/tests/__snapshots__/test_feature_project_detection.ambr @@ -11,7 +11,7 @@ # name: test_feature_project_persists_across_graph_operations dict({ 'project': 'persist_project', - 'tracking_version': '6411d6d0', + 'tracking_version': '2a72c9e2', }) # --- # name: test_multiple_features_same_project diff --git a/tests/__snapshots__/test_feature_tracking_version.ambr b/tests/__snapshots__/test_feature_tracking_version.ambr index 49fb53488..f37043726 100644 --- a/tests/__snapshots__/test_feature_tracking_version.ambr +++ b/tests/__snapshots__/test_feature_tracking_version.ambr @@ -12,8 +12,8 @@ 'feature_versions_same': True, 'project_a': 'project_a', 'project_b': 'project_b', - 'tracking_version_a': 'fc7bde342b5583a99961c5c1a5c6188d362a691d86d6c61d0b92cb03115f6ff9', - 'tracking_version_b': 'd6b58547ece9d614fd6194165ab23277d4d7db76656709f260e4866f02f18cb5', + 'tracking_version_a': '7da02a89202f8eb9f19782512fadf988b815df8255e52a1f35dec9da76e18b12', + 'tracking_version_b': '5908b5019bdc1acd103482f1d144338f407f50d9d1aa09ca3808fe376f5ffd98', 'tracking_versions_differ': True, }) # --- @@ -43,5 +43,5 @@ }) # --- # name: test_tracking_version_deterministic - '16cd8a38139da8d77590ab9c4503b68c376bc83b9a7e38b9d793e23c7efd8172' + '139c7982886b761076245c878e6ad545f4afb9ffabab9238d7af883aff161b9b' # --- diff --git a/tests/__snapshots__/test_id_columns.ambr b/tests/__snapshots__/test_id_columns.ambr index d8afcaefa..21c11dbe0 100644 --- a/tests/__snapshots__/test_id_columns.ambr +++ b/tests/__snapshots__/test_id_columns.ambr @@ -1,8 +1,8 @@ # serializer version: 1 # name: test_snapshot_stability_with_id_columns dict({ - 'composite': 'fe41246bb432835bcfced693551b7bdb25800be120315ebe353a4fea51c43523', - 'default': '92cb08d343fc7b981059efceeb076d4c0e5d5b60e18e25860a24abf5c94ee320', - 'single_custom': 'd23b1833c20c5d9da30008c1655c0bb09f9ed2c8bab27e42ee63c2c5bfdf7a14', + 'composite': '8dc629cb2bc4bf404e15495f03c7efdcb339b96531fe6579d3f927a82589f843', + 'default': '0f5dac77217aa051fd504566de0fa3e1d8d4a360ed7958e716ff0f6ebb483a34', + 'single_custom': 'cfca9196fcb832b9da472042d9fd1ae5c485352d8e409d3d7c1f739880249c22', }) # --- diff --git a/tests/__snapshots__/test_spec_version.ambr b/tests/__snapshots__/test_spec_version.ambr index f899a73cc..21d30fbd8 100644 --- a/tests/__snapshots__/test_spec_version.ambr +++ b/tests/__snapshots__/test_spec_version.ambr @@ -1,14 +1,18 @@ # serializer version: 1 # name: test_feature_spec_version_deterministic - 'd58f485df70cf7afb2e0fc7e54eebf52234a1be0c22015ccd3239739077aae52' + 'b0741037ecd180b5948761b9d73b4dd78efa9cf7224240d3e3b718bae26e5852' # --- # name: test_feature_spec_version_includes_all_properties +<<<<<<< HEAD '9421fb545f2ace1ffb1e07c2d14cc62427d0d261a3b509c8a1473f574efc8cce' +======= + '57050f5f60b0044e84c6b2e82c12dc3fa1bbe03e2db4230255356268a6a50edc' +>>>>>>> 768008a (chore: savepoint) # --- # name: test_feature_spec_version_recorded_in_metadata_store dict({ 'feature_key': 'recorded/feature', - 'feature_spec_version': '1103e6a29e9cc569cde596388b9da0961f2e308ae7d492013dce6bbf9566994c', + 'feature_spec_version': '39bcce7c7d9bc23f78982655717dc56879265b87304a96f371c7935df5c59d54', 'feature_version': '0c7b2d83252fbf2f689bec7d37c4f7ffb103755d11eafabc1e000a372f415d83', 'snapshot_version': '14d4294da40f4ab27ca85eb739fffd70e92c92522dbcba995cfa2aa343988bf3', }) @@ -36,15 +40,22 @@ ]), 'metadata': None, }), - 'feature_spec_version': '63d427712eb566d6c066c2514693714d82df604af91b48eae453634c4d684339', - 'feature_tracking_version': 'dabc7e8b54edb77992145840b2df9c81d4e384f3cdb1736bd3d5ecb83b0f422a', + 'feature_spec_version': 'ceaebb2a96f45752b2ca7daf8326b893d029cc56b366d7ee67bd4a3667008a03', + 'feature_tracking_version': '7dac08b916cbef01ec6971f5dd00a1c520c31ed6c09d1fd2223e82ed418546fb', 'feature_version': '7cfde77960e3cf10327e9cb97f311418b101fc5ee2c146922a997306e318edaf', 'project': 'test', }) # --- # name: test_feature_spec_version_with_column_selection_and_rename +<<<<<<< HEAD '9bdc3eb37440620d259db2709010b3d920a7c3147ba30d370494cb3fecc787fc' # --- # name: test_feature_spec_version_with_multiple_complex_deps '3a24675ffa9a6a053dd6738cfe1706470cabc458ec5702d5837138a205bdf4fc' +======= + '00612bc7d53f750ed1c293f7fa7af30156957d643dd4a737501bf1ed7fbb30fe' +# --- +# name: test_feature_spec_version_with_multiple_complex_deps + '1ed84211824fea4af008270ecec7fe3307473a0b19a48f29fd9014b680568882' +>>>>>>> 768008a (chore: savepoint) # --- diff --git a/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr b/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr index ff3830bf3..101319f65 100644 --- a/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr +++ b/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr @@ -29,19 +29,27 @@ 'versions': list([ dict({ 'feature_key': 'downstream', +<<<<<<< HEAD 'feature_spec_version': 'c7e67b870148746be6b9caf37ea7fc6c46acb858c8ec31ff2f2961f0a5ac1754', +======= + 'feature_spec_version': '94ac4803ccb1618a54c2621f2cac20c0021da72d16fa89f84594293e6858e8e0', +>>>>>>> 768008a (chore: savepoint) 'feature_version': '7b8e38a11f800fb93c18c0e04c4609d7b52ec12b62177ba8355531d363f01751', 'snapshot_version': '1ab4ecd9d7af535b843dc22416dd99d5e430317c0a2bcd01fe83ffc2d7392da8', }), dict({ 'feature_key': 'upstream', - 'feature_spec_version': '66c1d90d5ff75442180f421208a4bfdf551d99b1814dbebddeed0a4a8b07423a', + 'feature_spec_version': 'a8e3517bbf15ca3ecc14d6300c3c78ace14765963993cfc5ffbee8512c6a7655', 'feature_version': '8a2ffeab8da447095c5ee7a77e5635a1e16e7f3605021732f50f7002fa258398', 'snapshot_version': '1ab4ecd9d7af535b843dc22416dd99d5e430317c0a2bcd01fe83ffc2d7392da8', }), dict({ 'feature_key': 'downstream', +<<<<<<< HEAD 'feature_spec_version': 'bd492837d4324369f47605ee647b7aaeb5db68ac240a85a40fbd9e59351afa2a', +======= + 'feature_spec_version': '0f0db4be0886aefa1a7885420c592baff273f0429fdcd9644702c469b849d139', +>>>>>>> 768008a (chore: savepoint) 'feature_version': '7b8e38a11f800fb93c18c0e04c4609d7b52ec12b62177ba8355531d363f01751', 'snapshot_version': '1ab4ecd9d7af535b843dc22416dd99d5e430317c0a2bcd01fe83ffc2d7392da8', }), From d578edb603841805247678f2e16245d79354ec52 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:58:19 +0100 Subject: [PATCH 26/73] fix: cleanup after rebase --- src/metaxy/models/feature.py | 6 ++++++ src/metaxy/models/feature_spec.py | 2 +- src/metaxy/utils/typing.py | 7 ++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index 175379ec9..931f17df8 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -828,6 +828,12 @@ class BaseFeature( graph: ClassVar[FeatureGraph] project: ClassVar[str] code_version: ClassVar[str] = _CodeVersionDescriptor() # pyright: ignore[reportAssignmentType] + """Hash of this feature's field code versions only (excludes dependencies). + + Useful for detecting when the feature's own logic changes while remaining stable + if only upstream dependencies mutate; contrast with feature_version(), which + includes dependency hashes. + """ # once ClassVar supports it # this should be replaced by diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index c53e28209..3b7b718d0 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -198,7 +198,7 @@ def table_name(self) -> str: """Get SQL-like table name for this feature spec.""" return self.feature.table_name - + def _freeze_metadata(value: Any) -> Any: """Recursively freeze metadata containers to enforce immutability.""" if isinstance(value, frozendict): diff --git a/src/metaxy/utils/typing.py b/src/metaxy/utils/typing.py index 9d6fdb135..460c98e36 100644 --- a/src/metaxy/utils/typing.py +++ b/src/metaxy/utils/typing.py @@ -1,5 +1,10 @@ class _CodeVersionDescriptor: - """Descriptor that returns field-only code version hashes.""" + """Descriptor returning this feature's field-only code version hash. + + The hash is cached on the feature spec (`FeatureSpec.field_code_version_hash`) + and excludes any dependency information, allowing callers to distinguish + between "my code changed" and "one of my dependencies changed". + """ def __get__(self, instance, owner) -> str: if owner.spec is None: From e2dbd3eb16a9ae617350764e9bf3d158d9869076 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 14:39:25 +0100 Subject: [PATCH 27/73] chore: rename, fix --- CLAUDE.md | 2 +- src/metaxy/models/feature.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 377f26e91..d4923531e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -270,7 +270,7 @@ Features can override `load_input()` for custom join logic: This is critical for migrations when upstream dependencies change. -#### User-Defined Metadata +#### Attaching Metadata to Features Features can include user-defined metadata for documentation and tooling purposes: diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index 931f17df8..756f8cbab 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -19,7 +19,6 @@ from metaxy.models.plan import FeaturePlan, FQFieldKey from metaxy.models.types import FeatureKey from metaxy.utils.hashing import truncate_hash -from metaxy.utils.typing import _CodeVersionDescriptor if TYPE_CHECKING: import narwhals as nw From 1e0b963420b9100117458fb52691c5f44f0d698e Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 14:58:24 +0100 Subject: [PATCH 28/73] fix: tests --- src/metaxy/models/feature_spec.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 3b7b718d0..a48252b56 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -232,6 +232,7 @@ class _BaseFeatureSpec(pydantic.BaseModel): ) ] ) + metadata: dict[str, Any] | None = None class BaseFeatureSpec(_BaseFeatureSpec, Generic[IDColumnsT]): @@ -345,7 +346,7 @@ def validate_id_columns(self) -> BaseFeatureSpec[IDColumnsT]: return self @pydantic.model_validator(mode="after") - def validate_metadata_json_serializable(self) -> FeatureSpec: + def validate_metadata_json_serializable(self) -> BaseFeatureSpec[IDColumnsT]: """Validate that metadata is JSON-serializable. This ensures that metadata can be safely serialized for storage, From 858ac47c28b77d25ad508b181bd48ca5b85031fb Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:06:41 +0100 Subject: [PATCH 29/73] chore: refine --- CLAUDE.md | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index d4923531e..c6f296eda 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -272,13 +272,8 @@ This is critical for migrations when upstream dependencies change. #### Attaching Metadata to Features -Features can include user-defined metadata for documentation and tooling purposes: - -- **metadata parameter**: Optional dict on FeatureSpec for attaching arbitrary information -- **No effect on versioning**: metadata does NOT affect `feature_version()` or `code_version()` -- **Affects spec version**: metadata IS included in `feature_spec_version` for audit trail -- **Must be JSON-serializable**: Validated at initialization -- **Use cases**: Owner, team, SLA, description, tags, custom configuration +Additional metadata (JSON) can be attached to features via the `metadata` parameter on `FeatureSpec`. +Usecases may be for data governance such as ownership, SLAs, PII flags, ... etc. Example: From 855546a3a4d3f6a8bc4fed25ff0840bbca5ba8ca Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:07:34 +0100 Subject: [PATCH 30/73] chore: remove example --- CLAUDE.md | 60 ------------------------------------------------------- 1 file changed, 60 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index c6f296eda..0c7f402d1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -275,66 +275,6 @@ This is critical for migrations when upstream dependencies change. Additional metadata (JSON) can be attached to features via the `metadata` parameter on `FeatureSpec`. Usecases may be for data governance such as ownership, SLAs, PII flags, ... etc. -Example: - -```python -class CustomerFeature( - Feature, - spec=FeatureSpec( - key=FeatureKey(["customer"]), - deps=[FeatureDep(key=FeatureKey(["user"]))], - fields=[ - FieldSpec(key=FieldKey(["age"]), code_version=1), - FieldSpec(key=FieldKey(["lifetime_value"]), code_version=1), - ], - metadata={ - "owner": "data-team", - "sla": "24h", - "description": "Customer profile enrichment", - "tags": ["customer", "profile", "enrichment"], - "pii": True, - "custom_config": { - "refresh_interval": "1h", - "alert_threshold": 0.95, - }, - }, - ), -): - pass - - -# Access metadata -CustomerFeature.spec.metadata["owner"] # "data-team" - - -# Metadata doesn't affect versioning (these produce the same feature_version): -class Feature1( - Feature, - spec=FeatureSpec( - key=FeatureKey(["example"]), - deps=[], - fields=[FieldSpec(key=FieldKey(["value"]), code_version=1)], - metadata={"owner": "team-a"}, - ), -): - pass - - -class Feature2( - Feature, - spec=FeatureSpec( - key=FeatureKey(["example"]), - deps=[], - fields=[FieldSpec(key=FieldKey(["value"]), code_version=1)], - metadata={"owner": "team-b"}, - ), -): - pass - - -assert Feature1.feature_version() == Feature2.feature_version() -``` - ## Important Constraints ### Narwhals as the Public Interface From 7861e09fb95866648a60ade3a651b80f54c5d571 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:22:05 +0100 Subject: [PATCH 31/73] chore :refine docs --- docs/learn/feature-definitions.md | 55 ++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/docs/learn/feature-definitions.md b/docs/learn/feature-definitions.md index 3901ff5a7..f9eccd4c0 100644 --- a/docs/learn/feature-definitions.md +++ b/docs/learn/feature-definitions.md @@ -1,23 +1,32 @@ # Feature System -Metaxy has a declarative (defined statically at class level), expressive, flexible feature system. It has been inspired by Software-Defined Assets in [Dagster](https://dagster.io/). +Metaxy has a declarative (defined statically at class level), expressive, flexible feature system. +It has been inspired by Software-Defined Assets in [Dagster](https://dagster.io/). Features represent tabular **metadata**, typically containing references to external multi-modal **data** such as files, images, or videos. But it can be just pure **metadata** as well. I will highlight **data** and **metadata** with bold so it really stands out. -Metaxy is responsible for providing correct **metadata** to users. During incremental processing, Metaxy will automatically resolve added, changed and deleted **metadata** rows and calculate the right [sample versions](data-versioning.md) for them. Metaxy does not interact with **data** directly, the user is responsible for writing it, typically using **metadata** to identify sample locations in storage (it's a good idea to inject the sample version into the data sample identifier). Metaxy is designed to be used with systems that do not overwrite existing **metadata** (Metaxy only appends **metadata**) and therefore **data** as well (while we cannot enforce that since the user is responsible for writing the data, it's easily achievable by **including the sample version into the data sample identifier**). +Metaxy is responsible for providing correct **metadata** to users. +During incremental processing, Metaxy will automatically resolve added, changed and deleted **metadata** rows and calculate the right [sample versions](data-versioning.md) for them. +Metaxy does not interact with **data** directly, the user is responsible for writing it, typically using **metadata** to identify sample locations in storage (it's a good idea to inject the sample version into the data sample identifier). +Metaxy is designed to be used with systems that do not overwrite existing **metadata** (Metaxy only appends **metadata**) and therefore **data** as well (while we cannot enforce that since the user is responsible for writing the data, it's easily achievable by **including the sample version into the data sample identifier**). I hope we can stop using bold for **data** and **metadata** from now on, hopefully we've made our point. > [!tip] Include Sample Version In Your Data Path > Include the sample version in your data path to ensure strong consistency guarantees. I mean it. Really do it! -Features live on a global `FeatureGraph` object (typically users do not need to interact with it directly). Features are bound to a specific Metaxy project, but can be moved between projects over time. Features must have unique (across all projects) `FeatureKey` associated with them. +Features live on a global `FeatureGraph` object (typically users do not need to interact with it directly). +Features are bound to a specific Metaxy project, but can be moved between projects over time. +Features must have unique (across all projects) `FeatureKey` associated with them. ## Feature Specs -Before we can define a `Feature`, we must first create a `FeatureSpec` object. But before we get to an example, it's necessary to understand the concept of ID columns. Metaxy must know how to uniquely identify feature samples and join metadata tables, therefore, you need to attach one or more ID columns to your `FeatureSpec`. Very often these ID columns would stay the same across many feature specs, therefore it makes a lot of sense to define them on a shared base class. +Before we can define a `Feature`, we must first create a `FeatureSpec` object. +But before we get to an example, it's necessary to understand the concept of ID columns. +Metaxy must know how to uniquely identify feature samples and join metadata tables, therefore, you need to attach one or more ID columns to your `FeatureSpec`. +Very often these ID columns would stay the same across many feature specs, therefore it makes a lot of sense to define them on a shared base class. Some boilerplate with typing is involved (this is typically a good thing): @@ -36,13 +45,17 @@ class VideoFeatureSpec(BaseFeatureSpec[VideoIds]): `BaseFeatureSpec` is a [Pydantic](https://docs.pydantic.dev/latest/) model, so all normal Pydantic features apply. -Feature specs now support an optional `metadata` dictionary for attaching ownership, documentation, or tooling context to a feature. This metadata **never** influences graph topology or version hashes, must be JSON-serializable, and is stored as an immutable [`frozendict`](https://pypi.org/project/frozendict/) once the spec is created (list values are frozen as tuples to guarantee immutability). It is ideal for values such as owners, SLAs, runbooks, or tags that external systems may want to inspect. +Feature specs now support an optional `metadata` dictionary for attaching ownership, documentation, or tooling context to a feature. +This metadata **never** influences graph topology or version hashes, must be JSON-serializable, and is immutable once the spec is created. +It is ideal for values such as owners, SLAs, runbooks, or tags that external systems may want to inspect. With our `VideoFeatureSpec` in place, we can proceed to defining features that would be using it. ## Feature Definitions -Metaxy provides a `BaseFeature` class that can be extended to make user-defined features. It's a Pydantic model as well. User-defined `BaseFeature` classes must have fields matching ID columns of the `FeatureSpec` they are using. +Metaxy provides a `BaseFeature` class that can be extended to make user-defined features. +It's a Pydantic model as well. +User-defined `BaseFeature` classes must have fields matching ID columns of the `FeatureSpec` they are using. With respect to the same DRY principle, we can define a shared base class for features that use the `VideoFeatureSpec`. @@ -63,7 +76,9 @@ class VideoFeature(BaseVideoFeature, spec=VideoFeatureSpec(key="/raw/video")): path: str ``` -That's it! That's a roow feature, it doesn't have any dependencies. Easy. +That's it! +That's a raw single feature, it doesn't have any dependencies. +Easy. You may now use `VideoFeature.spec()` class method to access the original feature spec: it's bound to the class. @@ -83,17 +98,23 @@ Hurray! You get the idea. ## Field-Level Dependencies -A core (I'be straight: a killer) feature of Metaxy is the concept of **field-level dependencies**. These are used to define dependencies between logical fields of features. +A core (I'be straight: a killer) feature of Metaxy is the concept of **field-level dependencies**. +These are used to define dependencies between logical fields of features. -A **field** is not to be confused with metadata _column_ (Pydantic fields). Fields are completely independent from them. +A **field** is not to be confused with metadata _column_ (Pydantic fields). +Fields are completely independent from them. Columns refer to _metadata_ and are stored in metadata stores (such as databases) supported by Metaxy. -Fields refer to _data_ and are logical -- users are free to define them as they see fit. Fields are supposed to represent parts of data that users care about. For example, a `Video` feature -- an `.mp4` file -- may have `frames` and `audio` fields. +Fields refer to _data_ and are logical -- users are free to define them as they see fit. +Fields are supposed to represent parts of data that users care about. +For example, a `Video` feature -- an `.mp4` file -- may have `frames` and `audio` fields. -Downstream features can depend on specific fields of upstream features. This enables fine-grained control over data versioning, avoiding unnecessary reprocessing. +Downstream features can depend on specific fields of upstream features. +This enables fine-grained control over data versioning, avoiding unnecessary reprocessing. -At this point, careful readers have probably noticed that the `Transcript` feature from the [example](#feature-specs) above should not depend on the full video: it only needs the audio track in order to generate the transcript. Let's express that with Metaxy: +At this point, careful readers have probably noticed that the `Transcript` feature from the [example](#feature-specs) above should not depend on the full video: it only needs the audio track in order to generate the transcript. +Let's express that with Metaxy: ```py from metaxy import FieldDep, FieldSpec @@ -116,13 +137,16 @@ The [Data Versioning](data-versioning.md) docs explain more about this system. ### Fully Qualified Field Key -A **fully qualified field key (FQFK)** is an identifier that uniquely identifies a field within the whole feature graph. It consists of the **feature key** and the **field key**, separated by a colon, for example: `/raw/video:frames`, `/raw/video:audio/english`. +A **fully qualified field key (FQFK)** is an identifier that uniquely identifies a field within the whole feature graph. +It consists of the **feature key** and the **field key**, separated by a colon, for example: `/raw/video:frames`, `/raw/video:audio/english`. ## A Note on Type Coercion for Metaxy types Internally, Metaxy uses strongly typed Pydantic models to represent feature keys, their fields, and the dependencies between them. -To avoid boilerplate, Metaxy also has syntactic sugar for construction of these classes. Different ways to provide them are automatically coerced into canonical internal models. This is fully typed and only affects **constructor arguments**, so accessing **attributes** on Metaxy models will always return only the canonical types. +To avoid boilerplate, Metaxy also has syntactic sugar for construction of these classes. +Different ways to provide them are automatically coerced into canonical internal models. +This is fully typed and only affects **constructor arguments**, so accessing **attributes** on Metaxy models will always return only the canonical types. Some examples: @@ -135,7 +159,8 @@ key = FeatureKey("prefix", "feature") same_key = FeatureKey(key) ``` -Metaxy really loves you, the user! See [syntactic sugar](#syntactic-sugar) for more details. +Metaxy really loves you, the user! +See [syntactic sugar](#syntactic-sugar) for more details. ## Syntactic Sugar From c40b928ad7ab675a7d405f4a3d3077f365eeea61 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:24:47 +0100 Subject: [PATCH 32/73] chore: remove diff --- src/metaxy/models/feature.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index 756f8cbab..cf8087b3c 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -827,12 +827,6 @@ class BaseFeature( graph: ClassVar[FeatureGraph] project: ClassVar[str] code_version: ClassVar[str] = _CodeVersionDescriptor() # pyright: ignore[reportAssignmentType] - """Hash of this feature's field code versions only (excludes dependencies). - - Useful for detecting when the feature's own logic changes while remaining stable - if only upstream dependencies mutate; contrast with feature_version(), which - includes dependency hashes. - """ # once ClassVar supports it # this should be replaced by From 75f5d3938f3af7d24e6d2fb6cc4ee2cb02ae7042 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:27:55 +0100 Subject: [PATCH 33/73] fix: cleanup --- src/metaxy/utils/typing.py | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 src/metaxy/utils/typing.py diff --git a/src/metaxy/utils/typing.py b/src/metaxy/utils/typing.py deleted file mode 100644 index 460c98e36..000000000 --- a/src/metaxy/utils/typing.py +++ /dev/null @@ -1,14 +0,0 @@ -class _CodeVersionDescriptor: - """Descriptor returning this feature's field-only code version hash. - - The hash is cached on the feature spec (`FeatureSpec.field_code_version_hash`) - and excludes any dependency information, allowing callers to distinguish - between "my code changed" and "one of my dependencies changed". - """ - - def __get__(self, instance, owner) -> str: - if owner.spec is None: - raise ValueError( - f"Feature '{owner.__name__}' has no spec; cannot compute code_version." - ) - return owner.spec.field_code_version_hash From cd25a2f8c8e3b5da6b40fbf5cb1262407fd93306 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:34:06 +0100 Subject: [PATCH 34/73] fix: address reviewers comments --- src/metaxy/models/feature_spec.py | 90 ++++++++++++++++--------------- tests/test_feature_metadata.py | 17 ++++-- 2 files changed, 62 insertions(+), 45 deletions(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index a48252b56..c11364a29 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -4,6 +4,7 @@ import json from collections.abc import Mapping, Sequence from functools import cached_property +from types import MappingProxyType from typing import ( TYPE_CHECKING, Annotated, @@ -18,8 +19,7 @@ ) import pydantic -from frozendict import frozendict -from pydantic import BeforeValidator +from pydantic import BeforeValidator, Json from typing_extensions import Self from metaxy.models.field import FieldSpec, SpecialFieldDep @@ -199,12 +199,19 @@ def table_name(self) -> str: return self.feature.table_name +IDColumns: TypeAlias = Sequence[ + str +] # non-bound, should be used for feature specs with arbitrary id columns +IDColumnsT = TypeVar( + "IDColumnsT", bound=IDColumns, covariant=True +) # bound, should be used for generic + + def _freeze_metadata(value: Any) -> Any: - """Recursively freeze metadata containers to enforce immutability.""" - if isinstance(value, frozendict): - return value + """Recursively convert metadata to immutable containers.""" if isinstance(value, Mapping): - return frozendict({k: _freeze_metadata(v) for k, v in value.items()}) + frozen_dict = {k: _freeze_metadata(v) for k, v in value.items()} + return MappingProxyType(frozen_dict) if isinstance(value, list): return tuple(_freeze_metadata(v) for v in value) if isinstance(value, tuple): @@ -212,12 +219,14 @@ def _freeze_metadata(value: Any) -> Any: return value -IDColumns: TypeAlias = Sequence[ - str -] # non-bound, should be used for feature specs with arbitrary id columns -IDColumnsT = TypeVar( - "IDColumnsT", bound=IDColumns, covariant=True -) # bound, should be used for generic +def _thaw_metadata(value: Any) -> Any: + if isinstance(value, MappingProxyType): + return {k: _thaw_metadata(v) for k, v in value.items()} + if isinstance(value, tuple): + return [_thaw_metadata(v) for v in value] + if isinstance(value, list): + return [_thaw_metadata(v) for v in value] + return value class _BaseFeatureSpec(pydantic.BaseModel): @@ -232,12 +241,36 @@ class _BaseFeatureSpec(pydantic.BaseModel): ) ] ) + metadata: Json[dict[str, Any]] = pydantic.Field( + default_factory=dict, + description="Metadata attached to this feature.", + ) metadata: dict[str, Any] | None = None class BaseFeatureSpec(_BaseFeatureSpec, Generic[IDColumnsT]): id_columns: pydantic.SkipValidation[IDColumnsT] + @pydantic.model_validator(mode="before") + @classmethod + def _default_metadata(cls, values: dict[str, Any]) -> dict[str, Any]: + # Allow callers to omit metadata or pass None while keeping the field non-optional. + if "metadata" in values and values["metadata"] is None: + values.pop("metadata", None) + elif "metadata" in values: + try: + json.dumps(_thaw_metadata(values["metadata"])) + except (TypeError, ValueError) as exc: + raise ValueError( + "metadata must be JSON-serializable. Found non-serializable value" + ) from exc + return values + + @pydantic.model_validator(mode="after") + def _freeze_metadata_field(self) -> BaseFeatureSpec[IDColumnsT]: + object.__setattr__(self, "metadata", _freeze_metadata(self.metadata)) + return self + @overload def __init__( self, @@ -246,7 +279,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, - metadata: dict[str, Any] | None = None, + metadata: Json[dict[str, Any]] | dict[str, Any] | None = None, ) -> None: """Initialize from string key.""" ... @@ -259,7 +292,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, - metadata: dict[str, Any] | None = None, + metadata: Json[dict[str, Any]] | dict[str, Any] | None = None, ) -> None: """Initialize from sequence of parts.""" ... @@ -272,7 +305,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, - metadata: dict[str, Any] | None = None, + metadata: Json[dict[str, Any]] | dict[str, Any] | None = None, ) -> None: """Initialize from FeatureKey instance.""" ... @@ -345,33 +378,6 @@ def validate_id_columns(self) -> BaseFeatureSpec[IDColumnsT]: ) return self - @pydantic.model_validator(mode="after") - def validate_metadata_json_serializable(self) -> BaseFeatureSpec[IDColumnsT]: - """Validate that metadata is JSON-serializable. - - This ensures that metadata can be safely serialized for storage, - transmission, and graph snapshots. - - Note: Metadata is kept as a mutable dict for Pydantic serialization compatibility, - but users should treat it as immutable. The frozen FeatureSpec model prevents - reassignment of the metadata field itself. - - Raises: - ValueError: If metadata contains non-JSON-serializable types - """ - if self.metadata is not None: - try: - # Attempt to serialize and deserialize to validate - json.dumps(self.metadata) - except (TypeError, ValueError) as e: - raise ValueError( - f"metadata must be JSON-serializable. " - f"Found non-serializable value: {e}" - ) from e - object.__setattr__(self, "metadata", _freeze_metadata(self.metadata)) - - return self - @property def feature_spec_version(self) -> str: """Compute SHA256 hash of the complete feature specification. diff --git a/tests/test_feature_metadata.py b/tests/test_feature_metadata.py index 4c880a9ab..c4634de52 100644 --- a/tests/test_feature_metadata.py +++ b/tests/test_feature_metadata.py @@ -1,7 +1,8 @@ import json +from collections.abc import Sequence +from types import MappingProxyType import pytest -from frozendict import frozendict from metaxy import Feature, FeatureKey, FeatureSpec from metaxy.models.feature import FeatureGraph @@ -60,9 +61,9 @@ def test_metadata_json_serializable() -> None: metadata=valid_metadata, ) assert spec.metadata is not None - assert isinstance(spec.metadata, frozendict) + assert isinstance(spec.metadata, MappingProxyType) assert isinstance(spec.metadata["list"], tuple) - assert json.dumps(dict(spec.metadata)) is not None + assert json.dumps(_thaw(spec.metadata)) is not None with pytest.raises(ValueError): FeatureSpec( @@ -83,3 +84,13 @@ def test_metadata_immutable() -> None: with pytest.raises(TypeError): spec.metadata["key"] = "new_value" # type: ignore[index] + + +def _thaw(value): + if isinstance(value, MappingProxyType): + return {k: _thaw(v) for k, v in value.items()} + if isinstance(value, tuple): + return [_thaw(v) for v in value] + if isinstance(value, Sequence) and not isinstance(value, (str, bytes)): + return [_thaw(v) for v in value] + return value From 8aa3a9d1fb0a3d667412c7d2ba4e6d7c80fcbdc6 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:39:19 +0100 Subject: [PATCH 35/73] fix: remove frozen dict --- pyproject.toml | 1 - uv.lock | 21 --------------------- 2 files changed, 22 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 53b6623db..7161c27f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ authors = [ requires-python = ">=3.10" dependencies = [ "cyclopts==4.0.0b1", - "frozendict>=2.4.4", "narwhals>=2.9.0", "polars>=1.33.1", "polars-hash>=0.5.1", diff --git a/uv.lock b/uv.lock index 26f464e48..c46262fbd 100644 --- a/uv.lock +++ b/uv.lock @@ -518,25 +518,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, ] -[[package]] -name = "frozendict" -version = "2.4.6" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/bb/59/19eb300ba28e7547538bdf603f1c6c34793240a90e1a7b61b65d8517e35e/frozendict-2.4.6.tar.gz", hash = "sha256:df7cd16470fbd26fc4969a208efadc46319334eb97def1ddf48919b351192b8e", size = 316416, upload-time = "2024-10-13T12:15:32.449Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a6/7f/e80cdbe0db930b2ba9d46ca35a41b0150156da16dfb79edcc05642690c3b/frozendict-2.4.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c3a05c0a50cab96b4bb0ea25aa752efbfceed5ccb24c007612bc63e51299336f", size = 37927, upload-time = "2024-10-13T12:14:17.927Z" }, - { url = "https://files.pythonhosted.org/packages/29/98/27e145ff7e8e63caa95fb8ee4fc56c68acb208bef01a89c3678a66f9a34d/frozendict-2.4.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f5b94d5b07c00986f9e37a38dd83c13f5fe3bf3f1ccc8e88edea8fe15d6cd88c", size = 37945, upload-time = "2024-10-13T12:14:19.976Z" }, - { url = "https://files.pythonhosted.org/packages/ac/f1/a10be024a9d53441c997b3661ea80ecba6e3130adc53812a4b95b607cdd1/frozendict-2.4.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4c789fd70879ccb6289a603cdebdc4953e7e5dea047d30c1b180529b28257b5", size = 117656, upload-time = "2024-10-13T12:14:22.038Z" }, - { url = "https://files.pythonhosted.org/packages/46/a6/34c760975e6f1cb4db59a990d58dcf22287e10241c851804670c74c6a27a/frozendict-2.4.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da6a10164c8a50b34b9ab508a9420df38f4edf286b9ca7b7df8a91767baecb34", size = 117444, upload-time = "2024-10-13T12:14:24.251Z" }, - { url = "https://files.pythonhosted.org/packages/62/dd/64bddd1ffa9617f50e7e63656b2a7ad7f0a46c86b5f4a3d2c714d0006277/frozendict-2.4.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9a8a43036754a941601635ea9c788ebd7a7efbed2becba01b54a887b41b175b9", size = 116801, upload-time = "2024-10-13T12:14:26.518Z" }, - { url = "https://files.pythonhosted.org/packages/45/ae/af06a8bde1947277aad895c2f26c3b8b8b6ee9c0c2ad988fb58a9d1dde3f/frozendict-2.4.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c9905dcf7aa659e6a11b8051114c9fa76dfde3a6e50e6dc129d5aece75b449a2", size = 117329, upload-time = "2024-10-13T12:14:28.485Z" }, - { url = "https://files.pythonhosted.org/packages/d2/df/be3fa0457ff661301228f4c59c630699568c8ed9b5480f113b3eea7d0cb3/frozendict-2.4.6-cp310-cp310-win_amd64.whl", hash = "sha256:323f1b674a2cc18f86ab81698e22aba8145d7a755e0ac2cccf142ee2db58620d", size = 37522, upload-time = "2024-10-13T12:14:30.418Z" }, - { url = "https://files.pythonhosted.org/packages/4a/6f/c22e0266b4c85f58b4613fec024e040e93753880527bf92b0c1bc228c27c/frozendict-2.4.6-cp310-cp310-win_arm64.whl", hash = "sha256:eabd21d8e5db0c58b60d26b4bb9839cac13132e88277e1376970172a85ee04b3", size = 34056, upload-time = "2024-10-13T12:14:31.757Z" }, - { url = "https://files.pythonhosted.org/packages/04/13/d9839089b900fa7b479cce495d62110cddc4bd5630a04d8469916c0e79c5/frozendict-2.4.6-py311-none-any.whl", hash = "sha256:d065db6a44db2e2375c23eac816f1a022feb2fa98cbb50df44a9e83700accbea", size = 16148, upload-time = "2024-10-13T12:15:26.839Z" }, - { url = "https://files.pythonhosted.org/packages/ba/d0/d482c39cee2ab2978a892558cf130681d4574ea208e162da8958b31e9250/frozendict-2.4.6-py312-none-any.whl", hash = "sha256:49344abe90fb75f0f9fdefe6d4ef6d4894e640fadab71f11009d52ad97f370b9", size = 16146, upload-time = "2024-10-13T12:15:28.16Z" }, - { url = "https://files.pythonhosted.org/packages/a5/8e/b6bf6a0de482d7d7d7a2aaac8fdc4a4d0bb24a809f5ddd422aa7060eb3d2/frozendict-2.4.6-py313-none-any.whl", hash = "sha256:7134a2bb95d4a16556bb5f2b9736dceb6ea848fa5b6f3f6c2d6dba93b44b4757", size = 16146, upload-time = "2024-10-13T12:15:29.495Z" }, -] - [[package]] name = "ghp-import" version = "2.1.0" @@ -1018,7 +999,6 @@ version = "0.0.0" source = { editable = "." } dependencies = [ { name = "cyclopts" }, - { name = "frozendict" }, { name = "narwhals" }, { name = "polars" }, { name = "polars-hash" }, @@ -1084,7 +1064,6 @@ docs = [ [package.metadata] requires-dist = [ { name = "cyclopts", git = "https://github.com/BrianPugh/cyclopts.git?branch=mkdocs-plugin" }, - { name = "frozendict", specifier = ">=2.4.4" }, { name = "ibis-framework", marker = "extra == 'ibis'", specifier = ">=11.0.0" }, { name = "mermaid-py", marker = "extra == 'mermaid'", specifier = ">=0.8.0" }, { name = "narwhals", specifier = ">=2.9.0" }, From 76952243aecb969e14afe1b3b7687046eaef6c5e Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:46:23 +0100 Subject: [PATCH 36/73] fix: ensure FrozenBaseModel is used --- src/metaxy/models/feature_spec.py | 48 ++++++++----------------------- tests/test_feature_metadata.py | 22 ++++---------- 2 files changed, 17 insertions(+), 53 deletions(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index c11364a29..368f33d6e 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -4,7 +4,6 @@ import json from collections.abc import Mapping, Sequence from functools import cached_property -from types import MappingProxyType from typing import ( TYPE_CHECKING, Annotated, @@ -19,9 +18,11 @@ ) import pydantic -from pydantic import BeforeValidator, Json +from pydantic import BeforeValidator +from pydantic.types import JsonValue from typing_extensions import Self +from metaxy.models.bases import FrozenBaseModel from metaxy.models.field import FieldSpec, SpecialFieldDep from metaxy.models.types import ( CoercibleToFeatureKey, @@ -207,29 +208,7 @@ def table_name(self) -> str: ) # bound, should be used for generic -def _freeze_metadata(value: Any) -> Any: - """Recursively convert metadata to immutable containers.""" - if isinstance(value, Mapping): - frozen_dict = {k: _freeze_metadata(v) for k, v in value.items()} - return MappingProxyType(frozen_dict) - if isinstance(value, list): - return tuple(_freeze_metadata(v) for v in value) - if isinstance(value, tuple): - return tuple(_freeze_metadata(v) for v in value) - return value - - -def _thaw_metadata(value: Any) -> Any: - if isinstance(value, MappingProxyType): - return {k: _thaw_metadata(v) for k, v in value.items()} - if isinstance(value, tuple): - return [_thaw_metadata(v) for v in value] - if isinstance(value, list): - return [_thaw_metadata(v) for v in value] - return value - - -class _BaseFeatureSpec(pydantic.BaseModel): +class _BaseFeatureSpec(FrozenBaseModel): key: Annotated[FeatureKey, BeforeValidator(FeatureKeyAdapter.validate_python)] deps: list[FeatureDep] | None = None fields: list[FieldSpec] = pydantic.Field( @@ -241,11 +220,10 @@ class _BaseFeatureSpec(pydantic.BaseModel): ) ] ) - metadata: Json[dict[str, Any]] = pydantic.Field( + metadata: JsonValue = pydantic.Field( default_factory=dict, description="Metadata attached to this feature.", ) - metadata: dict[str, Any] | None = None class BaseFeatureSpec(_BaseFeatureSpec, Generic[IDColumnsT]): @@ -258,19 +236,17 @@ def _default_metadata(cls, values: dict[str, Any]) -> dict[str, Any]: if "metadata" in values and values["metadata"] is None: values.pop("metadata", None) elif "metadata" in values: + metadata_value = values["metadata"] + if not isinstance(metadata_value, Mapping): + raise ValueError("metadata must be a mapping") try: - json.dumps(_thaw_metadata(values["metadata"])) + json.dumps(metadata_value) except (TypeError, ValueError) as exc: raise ValueError( "metadata must be JSON-serializable. Found non-serializable value" ) from exc return values - @pydantic.model_validator(mode="after") - def _freeze_metadata_field(self) -> BaseFeatureSpec[IDColumnsT]: - object.__setattr__(self, "metadata", _freeze_metadata(self.metadata)) - return self - @overload def __init__( self, @@ -279,7 +255,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, - metadata: Json[dict[str, Any]] | dict[str, Any] | None = None, + metadata: Mapping[str, Any] | JsonValue | None = None, ) -> None: """Initialize from string key.""" ... @@ -292,7 +268,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, - metadata: Json[dict[str, Any]] | dict[str, Any] | None = None, + metadata: Mapping[str, Any] | JsonValue | None = None, ) -> None: """Initialize from sequence of parts.""" ... @@ -305,7 +281,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, - metadata: Json[dict[str, Any]] | dict[str, Any] | None = None, + metadata: Mapping[str, Any] | JsonValue | None = None, ) -> None: """Initialize from FeatureKey instance.""" ... diff --git a/tests/test_feature_metadata.py b/tests/test_feature_metadata.py index c4634de52..e9ecd18bb 100644 --- a/tests/test_feature_metadata.py +++ b/tests/test_feature_metadata.py @@ -1,6 +1,4 @@ import json -from collections.abc import Sequence -from types import MappingProxyType import pytest @@ -61,9 +59,9 @@ def test_metadata_json_serializable() -> None: metadata=valid_metadata, ) assert spec.metadata is not None - assert isinstance(spec.metadata, MappingProxyType) - assert isinstance(spec.metadata["list"], tuple) - assert json.dumps(_thaw(spec.metadata)) is not None + assert isinstance(spec.metadata, dict) + assert isinstance(spec.metadata["list"], list) + assert json.dumps(spec.metadata) is not None with pytest.raises(ValueError): FeatureSpec( @@ -82,15 +80,5 @@ def test_metadata_immutable() -> None: ) assert spec.metadata is not None - with pytest.raises(TypeError): - spec.metadata["key"] = "new_value" # type: ignore[index] - - -def _thaw(value): - if isinstance(value, MappingProxyType): - return {k: _thaw(v) for k, v in value.items()} - if isinstance(value, tuple): - return [_thaw(v) for v in value] - if isinstance(value, Sequence) and not isinstance(value, (str, bytes)): - return [_thaw(v) for v in value] - return value + with pytest.raises(Exception): + spec.metadata = {"key": "new_value"} # type: ignore[assignment] From 847c13757101f44d6389a9b83b6986a4ce3c04f9 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 16:02:14 +0100 Subject: [PATCH 37/73] fix: refine --- src/metaxy/models/feature_spec.py | 45 +++++++++++++++++-------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 368f33d6e..e09378651 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -13,6 +13,7 @@ Protocol, TypeAlias, TypeVar, + cast, overload, runtime_checkable, ) @@ -208,6 +209,26 @@ def table_name(self) -> str: ) # bound, should be used for generic +def _coerce_metadata(value: Any) -> dict[str, JsonValue] | None: + if value is None: + return None + if not isinstance(value, Mapping): + raise ValueError("metadata must be a mapping") + try: + serialized = json.dumps(value) + except (TypeError, ValueError) as exc: + raise ValueError( + "metadata must be JSON-serializable. Found non-serializable value" + ) from exc + return cast(dict[str, JsonValue], json.loads(serialized)) + + +MetadataField = Annotated[ + dict[str, JsonValue] | None, + BeforeValidator(_coerce_metadata), +] + + class _BaseFeatureSpec(FrozenBaseModel): key: Annotated[FeatureKey, BeforeValidator(FeatureKeyAdapter.validate_python)] deps: list[FeatureDep] | None = None @@ -220,8 +241,8 @@ class _BaseFeatureSpec(FrozenBaseModel): ) ] ) - metadata: JsonValue = pydantic.Field( - default_factory=dict, + metadata: MetadataField = pydantic.Field( + default=None, description="Metadata attached to this feature.", ) @@ -229,24 +250,6 @@ class _BaseFeatureSpec(FrozenBaseModel): class BaseFeatureSpec(_BaseFeatureSpec, Generic[IDColumnsT]): id_columns: pydantic.SkipValidation[IDColumnsT] - @pydantic.model_validator(mode="before") - @classmethod - def _default_metadata(cls, values: dict[str, Any]) -> dict[str, Any]: - # Allow callers to omit metadata or pass None while keeping the field non-optional. - if "metadata" in values and values["metadata"] is None: - values.pop("metadata", None) - elif "metadata" in values: - metadata_value = values["metadata"] - if not isinstance(metadata_value, Mapping): - raise ValueError("metadata must be a mapping") - try: - json.dumps(metadata_value) - except (TypeError, ValueError) as exc: - raise ValueError( - "metadata must be JSON-serializable. Found non-serializable value" - ) from exc - return values - @overload def __init__( self, @@ -381,6 +384,8 @@ def feature_spec_version(self) -> str: # Use model_dump with mode="json" for deterministic serialization # This ensures all types (like FeatureKey) are properly serialized spec_dict = self.model_dump(mode="json") + if spec_dict.get("metadata") == {}: + spec_dict.pop("metadata", None) # Sort keys to ensure deterministic ordering spec_json = json.dumps(spec_dict, sort_keys=True) From 128fffd336901e8d8c1ece8f3f7abb2b68b2d69b Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 21:36:56 +0100 Subject: [PATCH 38/73] chore: k --- docs/learn/feature-definitions.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/learn/feature-definitions.md b/docs/learn/feature-definitions.md index f9eccd4c0..b3fe54e35 100644 --- a/docs/learn/feature-definitions.md +++ b/docs/learn/feature-definitions.md @@ -3,7 +3,8 @@ Metaxy has a declarative (defined statically at class level), expressive, flexible feature system. It has been inspired by Software-Defined Assets in [Dagster](https://dagster.io/). -Features represent tabular **metadata**, typically containing references to external multi-modal **data** such as files, images, or videos. But it can be just pure **metadata** as well. +Features represent tabular **metadata**, typically containing references to external multi-modal **data** such as files, images, or videos. +But it can be just pure **metadata** as well. I will highlight **data** and **metadata** with bold so it really stands out. @@ -14,8 +15,10 @@ Metaxy is designed to be used with systems that do not overwrite existing **meta I hope we can stop using bold for **data** and **metadata** from now on, hopefully we've made our point. -> [!tip] Include Sample Version In Your Data Path -> Include the sample version in your data path to ensure strong consistency guarantees. I mean it. Really do it! +> [!tip] Include sample version in your data path +> Include the sample version in your data path to ensure strong consistency guarantees. +> I mean it. +> Really do it! Features live on a global `FeatureGraph` object (typically users do not need to interact with it directly). Features are bound to a specific Metaxy project, but can be moved between projects over time. @@ -25,7 +28,7 @@ Features must have unique (across all projects) `FeatureKey` associated with the Before we can define a `Feature`, we must first create a `FeatureSpec` object. But before we get to an example, it's necessary to understand the concept of ID columns. -Metaxy must know how to uniquely identify feature samples and join metadata tables, therefore, you need to attach one or more ID columns to your `FeatureSpec`. +Metaxy must know how to uniquely identify feature samples and join metadata tables, therefore, you need to attach one or more ID columns to your `FeatureSpec`. Very often these ID columns would stay the same across many feature specs, therefore it makes a lot of sense to define them on a shared base class. Some boilerplate with typing is involved (this is typically a good thing): From 344d7a67744c77d4bf57bb877f76b718af912ef0 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:09:32 +0100 Subject: [PATCH 39/73] fix: update tests --- tests/__snapshots__/test_spec_version.ambr | 17 +++-------------- .../examples/__snapshots__/test_recompute.ambr | 16 ++++++++-------- .../__snapshots__/test_snapshot_push.ambr | 12 ++---------- 3 files changed, 13 insertions(+), 32 deletions(-) diff --git a/tests/__snapshots__/test_spec_version.ambr b/tests/__snapshots__/test_spec_version.ambr index 21d30fbd8..eba50e02c 100644 --- a/tests/__snapshots__/test_spec_version.ambr +++ b/tests/__snapshots__/test_spec_version.ambr @@ -3,11 +3,7 @@ 'b0741037ecd180b5948761b9d73b4dd78efa9cf7224240d3e3b718bae26e5852' # --- # name: test_feature_spec_version_includes_all_properties -<<<<<<< HEAD - '9421fb545f2ace1ffb1e07c2d14cc62427d0d261a3b509c8a1473f574efc8cce' -======= - '57050f5f60b0044e84c6b2e82c12dc3fa1bbe03e2db4230255356268a6a50edc' ->>>>>>> 768008a (chore: savepoint) + '8b54ae3cd476f3071a9437d4626658c15e40005380a0e2093585d442866e167b' # --- # name: test_feature_spec_version_recorded_in_metadata_store dict({ @@ -47,15 +43,8 @@ }) # --- # name: test_feature_spec_version_with_column_selection_and_rename -<<<<<<< HEAD - '9bdc3eb37440620d259db2709010b3d920a7c3147ba30d370494cb3fecc787fc' + 'a412fd83810799e529173eed40fbcc29ed9adaed8c37ac4ffb21edd4f47c60d1' # --- # name: test_feature_spec_version_with_multiple_complex_deps - '3a24675ffa9a6a053dd6738cfe1706470cabc458ec5702d5837138a205bdf4fc' -======= - '00612bc7d53f750ed1c293f7fa7af30156957d643dd4a737501bf1ed7fbb30fe' -# --- -# name: test_feature_spec_version_with_multiple_complex_deps - '1ed84211824fea4af008270ecec7fe3307473a0b19a48f29fd9014b680568882' ->>>>>>> 768008a (chore: savepoint) + '09d45d5f79ef381519aad8a30dae3c2bb253f8bc2e61c63114b8ac48fa8efcbc' # --- diff --git a/tests/examples/__snapshots__/test_recompute.ambr b/tests/examples/__snapshots__/test_recompute.ambr index 0e4bdc1a3..13f6cf1c9 100644 --- a/tests/examples/__snapshots__/test_recompute.ambr +++ b/tests/examples/__snapshots__/test_recompute.ambr @@ -2,18 +2,18 @@ # name: test_list_features[1] ''' --- - examples/parent (version + examples/parent (version 0aad9b8a2ea055cde3c4149fc4cde576e6478a982cea75b45c3cd012db43a5e8) Fields: - embeddings (code_version 1, version + embeddings (code_version 1, version 05e66510da58ef37168095b60188849cd6b1f0a4b539d0ac29ffd1e15b756459) --- - examples/child (version + examples/child (version 440ffb028aaa5cb21b155c4ef21debd81f283f99aa91ef58cbe541d71164b44f) Feature Dependencies: feature=examples/parent columns=None rename=None Fields: - predictions (code_version 1, version + predictions (code_version 1, version 1905d9e85ceb58b0361e863c7fbfbf5843582f8ed135ea5ce6c24ee2c68c6bb4) ''' @@ -21,18 +21,18 @@ # name: test_list_features[2] ''' --- - examples/parent (version + examples/parent (version a007f308d0a852e3fbf80a442bd0089e29eed94efefe85231ef4fc927aa7d737) Fields: - embeddings (code_version 2, version + embeddings (code_version 2, version 3c8d3e9ba031ab3613eb4db0877d3959fce76d94d625335b89bae7fbd4f27add) --- - examples/child (version + examples/child (version 7251e21c32d2d8e35a8ba389a8ce1b597663f206dee0ff55e542a3af1f1665cd) Feature Dependencies: feature=examples/parent columns=None rename=None Fields: - predictions (code_version 1, version + predictions (code_version 1, version bcb950543ea50ba3e19aef4846cbc3d36725e2548040c4327219eb2e75e6d997) ''' diff --git a/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr b/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr index 101319f65..d3eedb291 100644 --- a/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr +++ b/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr @@ -29,11 +29,7 @@ 'versions': list([ dict({ 'feature_key': 'downstream', -<<<<<<< HEAD - 'feature_spec_version': 'c7e67b870148746be6b9caf37ea7fc6c46acb858c8ec31ff2f2961f0a5ac1754', -======= - 'feature_spec_version': '94ac4803ccb1618a54c2621f2cac20c0021da72d16fa89f84594293e6858e8e0', ->>>>>>> 768008a (chore: savepoint) + 'feature_spec_version': '20d33a795e3990ca63dbde8a5cd9ee3ab48738c4c1f6375cdcbc638202c505e5', 'feature_version': '7b8e38a11f800fb93c18c0e04c4609d7b52ec12b62177ba8355531d363f01751', 'snapshot_version': '1ab4ecd9d7af535b843dc22416dd99d5e430317c0a2bcd01fe83ffc2d7392da8', }), @@ -45,11 +41,7 @@ }), dict({ 'feature_key': 'downstream', -<<<<<<< HEAD - 'feature_spec_version': 'bd492837d4324369f47605ee647b7aaeb5db68ac240a85a40fbd9e59351afa2a', -======= - 'feature_spec_version': '0f0db4be0886aefa1a7885420c592baff273f0429fdcd9644702c469b849d139', ->>>>>>> 768008a (chore: savepoint) + 'feature_spec_version': '8ba901bb7efebf6b75ab81334237508b6e399f5515d818e3f76a4cd6421991a4', 'feature_version': '7b8e38a11f800fb93c18c0e04c4609d7b52ec12b62177ba8355531d363f01751', 'snapshot_version': '1ab4ecd9d7af535b843dc22416dd99d5e430317c0a2bcd01fe83ffc2d7392da8', }), From bab86a25452be790576bd24d9bf7841881a1fea1 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 20:29:16 +0100 Subject: [PATCH 40/73] fix: refine implementation --- src/metaxy/models/feature.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index cf8087b3c..a52fe5985 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,6 +82,17 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) +class _CodeVersionDescriptor: + """Descriptor that returns field-only code version hashes.""" + + def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: + if owner.spec is None: + raise ValueError( + f"Feature '{owner.__name__}' has no spec; cannot compute code_version." + ) + return owner.spec.field_code_version_hash + + class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From e18753e9bd3470d2be49b6bb36148e3e6ae5f990 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:07:47 +0100 Subject: [PATCH 41/73] chore: cleanup --- src/metaxy/models/feature.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index a52fe5985..cf8087b3c 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,17 +82,6 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) -class _CodeVersionDescriptor: - """Descriptor that returns field-only code version hashes.""" - - def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: - if owner.spec is None: - raise ValueError( - f"Feature '{owner.__name__}' has no spec; cannot compute code_version." - ) - return owner.spec.field_code_version_hash - - class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From 7ab3718d6f982f78f011d38545ff77ca6a2b93bc Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 20:29:16 +0100 Subject: [PATCH 42/73] fix: refine implementation --- src/metaxy/models/feature.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index cf8087b3c..a52fe5985 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,6 +82,17 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) +class _CodeVersionDescriptor: + """Descriptor that returns field-only code version hashes.""" + + def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: + if owner.spec is None: + raise ValueError( + f"Feature '{owner.__name__}' has no spec; cannot compute code_version." + ) + return owner.spec.field_code_version_hash + + class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From 2e5b22e9847cf9a6953b6ac4a9f8306c81ca04ab Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:07:47 +0100 Subject: [PATCH 43/73] chore: cleanup --- src/metaxy/models/feature.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index a52fe5985..cf8087b3c 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,17 +82,6 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) -class _CodeVersionDescriptor: - """Descriptor that returns field-only code version hashes.""" - - def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: - if owner.spec is None: - raise ValueError( - f"Feature '{owner.__name__}' has no spec; cannot compute code_version." - ) - return owner.spec.field_code_version_hash - - class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From c2d48f4c472b3808ea7ec868ec306170c9fcd97b Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Wed, 29 Oct 2025 21:57:09 +0100 Subject: [PATCH 44/73] feat: #66 Add metadata parameter to FeatureSpec for user-defined information https://github.com/anam-org/metaxy/issues/66 --- src/metaxy/models/feature_spec.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index e09378651..138f20043 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -357,6 +357,32 @@ def validate_id_columns(self) -> BaseFeatureSpec[IDColumnsT]: ) return self + @pydantic.model_validator(mode="after") + def validate_metadata_json_serializable(self) -> "FeatureSpec": + """Validate that metadata is JSON-serializable. + + This ensures that metadata can be safely serialized for storage, + transmission, and graph snapshots. + + Note: Metadata is kept as a mutable dict for Pydantic serialization compatibility, + but users should treat it as immutable. The frozen FeatureSpec model prevents + reassignment of the metadata field itself. + + Raises: + ValueError: If metadata contains non-JSON-serializable types + """ + if self.metadata is not None: + try: + # Attempt to serialize and deserialize to validate + json.dumps(self.metadata) + except (TypeError, ValueError) as e: + raise ValueError( + f"metadata must be JSON-serializable. " + f"Found non-serializable value: {e}" + ) from e + + return self + @property def feature_spec_version(self) -> str: """Compute SHA256 hash of the complete feature specification. From 749120be4cd42e387cbb498cbed03908214cb7c4 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Wed, 29 Oct 2025 22:15:18 +0100 Subject: [PATCH 45/73] fix: refine snapshots again with pre-commit hook --- tests/cli/__snapshots__/test_cli_graph.ambr | 8 ++++---- tests/examples/__snapshots__/test_recompute.ambr | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/cli/__snapshots__/test_cli_graph.ambr b/tests/cli/__snapshots__/test_cli_graph.ambr index c925e2837..e60e527f5 100644 --- a/tests/cli/__snapshots__/test_cli_graph.ambr +++ b/tests/cli/__snapshots__/test_cli_graph.ambr @@ -7,17 +7,17 @@ labelloc=t; fontsize=14; fontname=helvetica; - + "examples/parent" ; "examples/child" ; - + "examples/parent" -> "examples/child"; - + "examples/parent::embeddings" ; "examples/parent" -> "examples/parent::embeddings" ; "examples/child::predictions" ; "examples/child" -> "examples/child::predictions" ; } - + ''' # --- diff --git a/tests/examples/__snapshots__/test_recompute.ambr b/tests/examples/__snapshots__/test_recompute.ambr index 13f6cf1c9..580ff420b 100644 --- a/tests/examples/__snapshots__/test_recompute.ambr +++ b/tests/examples/__snapshots__/test_recompute.ambr @@ -15,7 +15,7 @@ Fields: predictions (code_version 1, version 1905d9e85ceb58b0361e863c7fbfbf5843582f8ed135ea5ce6c24ee2c68c6bb4) - + ''' # --- # name: test_list_features[2] @@ -34,6 +34,6 @@ Fields: predictions (code_version 1, version bcb950543ea50ba3e19aef4846cbc3d36725e2548040c4327219eb2e75e6d997) - + ''' # --- From 0ed26107238668e7c498882daaec105c2e905d33 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Wed, 29 Oct 2025 22:27:51 +0100 Subject: [PATCH 46/73] fix: snapshot newlines --- tests/cli/__snapshots__/test_cli_graph.ambr | 8 ++++---- .../__snapshots__/test_recompute.ambr | 20 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/cli/__snapshots__/test_cli_graph.ambr b/tests/cli/__snapshots__/test_cli_graph.ambr index e60e527f5..c925e2837 100644 --- a/tests/cli/__snapshots__/test_cli_graph.ambr +++ b/tests/cli/__snapshots__/test_cli_graph.ambr @@ -7,17 +7,17 @@ labelloc=t; fontsize=14; fontname=helvetica; - + "examples/parent" ; "examples/child" ; - + "examples/parent" -> "examples/child"; - + "examples/parent::embeddings" ; "examples/parent" -> "examples/parent::embeddings" ; "examples/child::predictions" ; "examples/child" -> "examples/child::predictions" ; } - + ''' # --- diff --git a/tests/examples/__snapshots__/test_recompute.ambr b/tests/examples/__snapshots__/test_recompute.ambr index 580ff420b..0e4bdc1a3 100644 --- a/tests/examples/__snapshots__/test_recompute.ambr +++ b/tests/examples/__snapshots__/test_recompute.ambr @@ -2,38 +2,38 @@ # name: test_list_features[1] ''' --- - examples/parent (version + examples/parent (version 0aad9b8a2ea055cde3c4149fc4cde576e6478a982cea75b45c3cd012db43a5e8) Fields: - embeddings (code_version 1, version + embeddings (code_version 1, version 05e66510da58ef37168095b60188849cd6b1f0a4b539d0ac29ffd1e15b756459) --- - examples/child (version + examples/child (version 440ffb028aaa5cb21b155c4ef21debd81f283f99aa91ef58cbe541d71164b44f) Feature Dependencies: feature=examples/parent columns=None rename=None Fields: - predictions (code_version 1, version + predictions (code_version 1, version 1905d9e85ceb58b0361e863c7fbfbf5843582f8ed135ea5ce6c24ee2c68c6bb4) - + ''' # --- # name: test_list_features[2] ''' --- - examples/parent (version + examples/parent (version a007f308d0a852e3fbf80a442bd0089e29eed94efefe85231ef4fc927aa7d737) Fields: - embeddings (code_version 2, version + embeddings (code_version 2, version 3c8d3e9ba031ab3613eb4db0877d3959fce76d94d625335b89bae7fbd4f27add) --- - examples/child (version + examples/child (version 7251e21c32d2d8e35a8ba389a8ce1b597663f206dee0ff55e542a3af1f1665cd) Feature Dependencies: feature=examples/parent columns=None rename=None Fields: - predictions (code_version 1, version + predictions (code_version 1, version bcb950543ea50ba3e19aef4846cbc3d36725e2548040c4327219eb2e75e6d997) - + ''' # --- From 8dd43d8ba8d5c169ca32fce7852d2514199ed470 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 21:41:26 +0100 Subject: [PATCH 47/73] fix: refine impl --- pyproject.toml | 1 + src/metaxy/models/feature_spec.py | 15 +++++++++++++++ uv.lock | 21 +++++++++++++++++++++ 3 files changed, 37 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7161c27f8..53b6623db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ authors = [ requires-python = ">=3.10" dependencies = [ "cyclopts==4.0.0b1", + "frozendict>=2.4.4", "narwhals>=2.9.0", "polars>=1.33.1", "polars-hash>=0.5.1", diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 138f20043..0c01b6ffa 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -19,6 +19,7 @@ ) import pydantic +from frozendict import frozendict from pydantic import BeforeValidator from pydantic.types import JsonValue from typing_extensions import Self @@ -200,6 +201,19 @@ def table_name(self) -> str: """Get SQL-like table name for this feature spec.""" return self.feature.table_name + +def _freeze_metadata(value: Any) -> Any: + """Recursively freeze metadata containers to enforce immutability.""" + if isinstance(value, frozendict): + return value + if isinstance(value, Mapping): + return frozendict({k: _freeze_metadata(v) for k, v in value.items()}) + if isinstance(value, list): + return tuple(_freeze_metadata(v) for v in value) + if isinstance(value, tuple): + return tuple(_freeze_metadata(v) for v in value) + return value + IDColumns: TypeAlias = Sequence[ str @@ -380,6 +394,7 @@ def validate_metadata_json_serializable(self) -> "FeatureSpec": f"metadata must be JSON-serializable. " f"Found non-serializable value: {e}" ) from e + object.__setattr__(self, "metadata", _freeze_metadata(self.metadata)) return self diff --git a/uv.lock b/uv.lock index c46262fbd..26f464e48 100644 --- a/uv.lock +++ b/uv.lock @@ -518,6 +518,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, ] +[[package]] +name = "frozendict" +version = "2.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/59/19eb300ba28e7547538bdf603f1c6c34793240a90e1a7b61b65d8517e35e/frozendict-2.4.6.tar.gz", hash = "sha256:df7cd16470fbd26fc4969a208efadc46319334eb97def1ddf48919b351192b8e", size = 316416, upload-time = "2024-10-13T12:15:32.449Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/7f/e80cdbe0db930b2ba9d46ca35a41b0150156da16dfb79edcc05642690c3b/frozendict-2.4.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c3a05c0a50cab96b4bb0ea25aa752efbfceed5ccb24c007612bc63e51299336f", size = 37927, upload-time = "2024-10-13T12:14:17.927Z" }, + { url = "https://files.pythonhosted.org/packages/29/98/27e145ff7e8e63caa95fb8ee4fc56c68acb208bef01a89c3678a66f9a34d/frozendict-2.4.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f5b94d5b07c00986f9e37a38dd83c13f5fe3bf3f1ccc8e88edea8fe15d6cd88c", size = 37945, upload-time = "2024-10-13T12:14:19.976Z" }, + { url = "https://files.pythonhosted.org/packages/ac/f1/a10be024a9d53441c997b3661ea80ecba6e3130adc53812a4b95b607cdd1/frozendict-2.4.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4c789fd70879ccb6289a603cdebdc4953e7e5dea047d30c1b180529b28257b5", size = 117656, upload-time = "2024-10-13T12:14:22.038Z" }, + { url = "https://files.pythonhosted.org/packages/46/a6/34c760975e6f1cb4db59a990d58dcf22287e10241c851804670c74c6a27a/frozendict-2.4.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da6a10164c8a50b34b9ab508a9420df38f4edf286b9ca7b7df8a91767baecb34", size = 117444, upload-time = "2024-10-13T12:14:24.251Z" }, + { url = "https://files.pythonhosted.org/packages/62/dd/64bddd1ffa9617f50e7e63656b2a7ad7f0a46c86b5f4a3d2c714d0006277/frozendict-2.4.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9a8a43036754a941601635ea9c788ebd7a7efbed2becba01b54a887b41b175b9", size = 116801, upload-time = "2024-10-13T12:14:26.518Z" }, + { url = "https://files.pythonhosted.org/packages/45/ae/af06a8bde1947277aad895c2f26c3b8b8b6ee9c0c2ad988fb58a9d1dde3f/frozendict-2.4.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c9905dcf7aa659e6a11b8051114c9fa76dfde3a6e50e6dc129d5aece75b449a2", size = 117329, upload-time = "2024-10-13T12:14:28.485Z" }, + { url = "https://files.pythonhosted.org/packages/d2/df/be3fa0457ff661301228f4c59c630699568c8ed9b5480f113b3eea7d0cb3/frozendict-2.4.6-cp310-cp310-win_amd64.whl", hash = "sha256:323f1b674a2cc18f86ab81698e22aba8145d7a755e0ac2cccf142ee2db58620d", size = 37522, upload-time = "2024-10-13T12:14:30.418Z" }, + { url = "https://files.pythonhosted.org/packages/4a/6f/c22e0266b4c85f58b4613fec024e040e93753880527bf92b0c1bc228c27c/frozendict-2.4.6-cp310-cp310-win_arm64.whl", hash = "sha256:eabd21d8e5db0c58b60d26b4bb9839cac13132e88277e1376970172a85ee04b3", size = 34056, upload-time = "2024-10-13T12:14:31.757Z" }, + { url = "https://files.pythonhosted.org/packages/04/13/d9839089b900fa7b479cce495d62110cddc4bd5630a04d8469916c0e79c5/frozendict-2.4.6-py311-none-any.whl", hash = "sha256:d065db6a44db2e2375c23eac816f1a022feb2fa98cbb50df44a9e83700accbea", size = 16148, upload-time = "2024-10-13T12:15:26.839Z" }, + { url = "https://files.pythonhosted.org/packages/ba/d0/d482c39cee2ab2978a892558cf130681d4574ea208e162da8958b31e9250/frozendict-2.4.6-py312-none-any.whl", hash = "sha256:49344abe90fb75f0f9fdefe6d4ef6d4894e640fadab71f11009d52ad97f370b9", size = 16146, upload-time = "2024-10-13T12:15:28.16Z" }, + { url = "https://files.pythonhosted.org/packages/a5/8e/b6bf6a0de482d7d7d7a2aaac8fdc4a4d0bb24a809f5ddd422aa7060eb3d2/frozendict-2.4.6-py313-none-any.whl", hash = "sha256:7134a2bb95d4a16556bb5f2b9736dceb6ea848fa5b6f3f6c2d6dba93b44b4757", size = 16146, upload-time = "2024-10-13T12:15:29.495Z" }, +] + [[package]] name = "ghp-import" version = "2.1.0" @@ -999,6 +1018,7 @@ version = "0.0.0" source = { editable = "." } dependencies = [ { name = "cyclopts" }, + { name = "frozendict" }, { name = "narwhals" }, { name = "polars" }, { name = "polars-hash" }, @@ -1064,6 +1084,7 @@ docs = [ [package.metadata] requires-dist = [ { name = "cyclopts", git = "https://github.com/BrianPugh/cyclopts.git?branch=mkdocs-plugin" }, + { name = "frozendict", specifier = ">=2.4.4" }, { name = "ibis-framework", marker = "extra == 'ibis'", specifier = ">=11.0.0" }, { name = "mermaid-py", marker = "extra == 'mermaid'", specifier = ">=0.8.0" }, { name = "narwhals", specifier = ">=2.9.0" }, From 36a2b2ef3e4475edb39cecdffb33a94c3064c8f9 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 07:58:19 +0100 Subject: [PATCH 48/73] fix: cleanup after rebase --- src/metaxy/models/feature.py | 6 ++++++ src/metaxy/models/feature_spec.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index cf8087b3c..756f8cbab 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -827,6 +827,12 @@ class BaseFeature( graph: ClassVar[FeatureGraph] project: ClassVar[str] code_version: ClassVar[str] = _CodeVersionDescriptor() # pyright: ignore[reportAssignmentType] + """Hash of this feature's field code versions only (excludes dependencies). + + Useful for detecting when the feature's own logic changes while remaining stable + if only upstream dependencies mutate; contrast with feature_version(), which + includes dependency hashes. + """ # once ClassVar supports it # this should be replaced by diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 0c01b6ffa..4b2d2ead6 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -201,7 +201,7 @@ def table_name(self) -> str: """Get SQL-like table name for this feature spec.""" return self.feature.table_name - + def _freeze_metadata(value: Any) -> Any: """Recursively freeze metadata containers to enforce immutability.""" if isinstance(value, frozendict): From 824682f4ea69cb0750811d01d096d1258274f935 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:24:47 +0100 Subject: [PATCH 49/73] chore: remove diff --- src/metaxy/models/feature.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index 756f8cbab..cf8087b3c 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -827,12 +827,6 @@ class BaseFeature( graph: ClassVar[FeatureGraph] project: ClassVar[str] code_version: ClassVar[str] = _CodeVersionDescriptor() # pyright: ignore[reportAssignmentType] - """Hash of this feature's field code versions only (excludes dependencies). - - Useful for detecting when the feature's own logic changes while remaining stable - if only upstream dependencies mutate; contrast with feature_version(), which - includes dependency hashes. - """ # once ClassVar supports it # this should be replaced by From ad65a96ed667d8f10e9606d8249e2d005986fbd4 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:34:06 +0100 Subject: [PATCH 50/73] fix: address reviewers comments --- src/metaxy/models/feature_spec.py | 50 ++++++++++++++++++++++++------- tests/test_feature_metadata.py | 2 ++ 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 4b2d2ead6..d6a1ef0eb 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -4,6 +4,7 @@ import json from collections.abc import Mapping, Sequence from functools import cached_property +from types import MappingProxyType from typing import ( TYPE_CHECKING, Annotated, @@ -202,12 +203,19 @@ def table_name(self) -> str: return self.feature.table_name +IDColumns: TypeAlias = Sequence[ + str +] # non-bound, should be used for feature specs with arbitrary id columns +IDColumnsT = TypeVar( + "IDColumnsT", bound=IDColumns, covariant=True +) # bound, should be used for generic + + def _freeze_metadata(value: Any) -> Any: - """Recursively freeze metadata containers to enforce immutability.""" - if isinstance(value, frozendict): - return value + """Recursively convert metadata to immutable containers.""" if isinstance(value, Mapping): - return frozendict({k: _freeze_metadata(v) for k, v in value.items()}) + frozen_dict = {k: _freeze_metadata(v) for k, v in value.items()} + return MappingProxyType(frozen_dict) if isinstance(value, list): return tuple(_freeze_metadata(v) for v in value) if isinstance(value, tuple): @@ -215,12 +223,14 @@ def _freeze_metadata(value: Any) -> Any: return value -IDColumns: TypeAlias = Sequence[ - str -] # non-bound, should be used for feature specs with arbitrary id columns -IDColumnsT = TypeVar( - "IDColumnsT", bound=IDColumns, covariant=True -) # bound, should be used for generic +def _thaw_metadata(value: Any) -> Any: + if isinstance(value, MappingProxyType): + return {k: _thaw_metadata(v) for k, v in value.items()} + if isinstance(value, tuple): + return [_thaw_metadata(v) for v in value] + if isinstance(value, list): + return [_thaw_metadata(v) for v in value] + return value def _coerce_metadata(value: Any) -> dict[str, JsonValue] | None: @@ -264,6 +274,26 @@ class _BaseFeatureSpec(FrozenBaseModel): class BaseFeatureSpec(_BaseFeatureSpec, Generic[IDColumnsT]): id_columns: pydantic.SkipValidation[IDColumnsT] + @pydantic.model_validator(mode="before") + @classmethod + def _default_metadata(cls, values: dict[str, Any]) -> dict[str, Any]: + # Allow callers to omit metadata or pass None while keeping the field non-optional. + if "metadata" in values and values["metadata"] is None: + values.pop("metadata", None) + elif "metadata" in values: + try: + json.dumps(_thaw_metadata(values["metadata"])) + except (TypeError, ValueError) as exc: + raise ValueError( + "metadata must be JSON-serializable. Found non-serializable value" + ) from exc + return values + + @pydantic.model_validator(mode="after") + def _freeze_metadata_field(self) -> BaseFeatureSpec[IDColumnsT]: + object.__setattr__(self, "metadata", _freeze_metadata(self.metadata)) + return self + @overload def __init__( self, diff --git a/tests/test_feature_metadata.py b/tests/test_feature_metadata.py index e9ecd18bb..47ed6d431 100644 --- a/tests/test_feature_metadata.py +++ b/tests/test_feature_metadata.py @@ -1,4 +1,6 @@ import json +from collections.abc import Sequence +from types import MappingProxyType import pytest From b51c9483474fb4d90e03deda29bfc4e8416d774c Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:46:23 +0100 Subject: [PATCH 51/73] fix: ensure FrozenBaseModel is used --- src/metaxy/models/feature_spec.py | 11 ++++------- tests/test_feature_metadata.py | 2 -- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index d6a1ef0eb..59e94c842 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -4,7 +4,6 @@ import json from collections.abc import Mapping, Sequence from functools import cached_property -from types import MappingProxyType from typing import ( TYPE_CHECKING, Annotated, @@ -281,19 +280,17 @@ def _default_metadata(cls, values: dict[str, Any]) -> dict[str, Any]: if "metadata" in values and values["metadata"] is None: values.pop("metadata", None) elif "metadata" in values: + metadata_value = values["metadata"] + if not isinstance(metadata_value, Mapping): + raise ValueError("metadata must be a mapping") try: - json.dumps(_thaw_metadata(values["metadata"])) + json.dumps(metadata_value) except (TypeError, ValueError) as exc: raise ValueError( "metadata must be JSON-serializable. Found non-serializable value" ) from exc return values - @pydantic.model_validator(mode="after") - def _freeze_metadata_field(self) -> BaseFeatureSpec[IDColumnsT]: - object.__setattr__(self, "metadata", _freeze_metadata(self.metadata)) - return self - @overload def __init__( self, diff --git a/tests/test_feature_metadata.py b/tests/test_feature_metadata.py index 47ed6d431..e9ecd18bb 100644 --- a/tests/test_feature_metadata.py +++ b/tests/test_feature_metadata.py @@ -1,6 +1,4 @@ import json -from collections.abc import Sequence -from types import MappingProxyType import pytest From 19afc78ce395b11da5e25a1758e94629903f8429 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Thu, 30 Oct 2025 20:29:16 +0100 Subject: [PATCH 52/73] fix: refine implementation --- src/metaxy/models/feature.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index cf8087b3c..a52fe5985 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,6 +82,17 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) +class _CodeVersionDescriptor: + """Descriptor that returns field-only code version hashes.""" + + def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: + if owner.spec is None: + raise ValueError( + f"Feature '{owner.__name__}' has no spec; cannot compute code_version." + ) + return owner.spec.field_code_version_hash + + class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From 925d7153a6c1741c700a6c5c7e7e8185cce93870 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 14:49:57 +0100 Subject: [PATCH 53/73] feat: re-enable type checker --- .github/workflows/QA.yml | 55 ++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index fb86cbbd7..2d50b7948 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -58,36 +58,31 @@ jobs: with: dprint-version: 0.50.2 - # typecheck: - # runs-on: depot-ubuntu-latest - # env: - # UV_PYTHON_PREFERENCE: only-system - # steps: - # - name: Checkout code - # uses: actions/checkout@v4 - # - name: Install Nix - # uses: cachix/install-nix-action@v27 - # with: - # nix_path: nixpkgs=channel:nixpkgs-unstable - # - name: Setup Magic Nix Cache - # uses: DeterminateSystems/magic-nix-cache-action@v8 - # - uses: nicknovitski/nix-develop@v1 - # - name: Sync dependencies - # run: uv python pin 3.10 && uv sync --all-extras --all-groups - # - name: Replace bundled Node.js with Nix Node.js - # run: | - # # Find the bundled node binary and replace it with Nix's node - # BUNDLED_NODE=$(find .venv/lib/python3.10/site-packages/nodejs_wheel/bin -name "node" -type f 2>/dev/null || true) - # if [ -n "$BUNDLED_NODE" ]; then - # rm -f "$BUNDLED_NODE" - # ln -s "$(which node)" "$BUNDLED_NODE" - # echo "Replaced bundled node with Nix node: $(which node)" - # fi - # - name: Run Basedpyright - # run: uv run basedpyright --level error - # - name: Cleanup nix environment - # if: always() - # run: bash .github/scripts/cleanup-nix-env.sh + typecheck: + runs-on: depot-ubuntu-latest + env: + UV_PYTHON_PREFERENCE: only-system + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Save original environment + run: bash .github/scripts/save-env.sh + - name: Install Nix + uses: cachix/install-nix-action@v27 + with: + nix_path: nixpkgs=channel:nixpkgs-unstable + - name: Setup Magic Nix Cache + uses: DeterminateSystems/magic-nix-cache-action@v8 + - uses: nicknovitski/nix-develop@v1 + - name: Sync dependencies + run: uv python pin 3.10 && uv sync --all-extras --all-groups + - name: Restore original environment + run: bash .github/scripts/restore-env.sh + - name: Run Basedpyright + run: uv run basedpyright --level error + - name: Cleanup nix environment + if: always() + run: bash .github/scripts/cleanup-nix-env.sh test: needs: filter From df6523e7e7cf7ff625db41d26fe8aa5c9292e191 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 15:02:49 +0100 Subject: [PATCH 54/73] chore: reduce permissions --- .github/workflows/QA.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 2d50b7948..dcaaab9ee 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -194,6 +194,8 @@ jobs: check: if: always() runs-on: depot-ubuntu-latest + permissions: + contents: read needs: - test - lint From 0a2ab790da55f71272dd4f90346026ae18cf9ce5 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 16:27:30 +0100 Subject: [PATCH 55/73] fix: cleanup --- src/metaxy/models/feature.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/metaxy/models/feature.py b/src/metaxy/models/feature.py index a52fe5985..cf8087b3c 100644 --- a/src/metaxy/models/feature.py +++ b/src/metaxy/models/feature.py @@ -82,17 +82,6 @@ def get_feature_by_key(key: "FeatureKey") -> type["BaseFeature[IDColumns]"]: return graph.get_feature_by_key(key) -class _CodeVersionDescriptor: - """Descriptor that returns field-only code version hashes.""" - - def __get__(self, instance: "Feature | None", owner: type["Feature"]) -> str: - if owner.spec is None: - raise ValueError( - f"Feature '{owner.__name__}' has no spec; cannot compute code_version." - ) - return owner.spec.field_code_version_hash - - class FeatureGraph: def __init__(self): self.features_by_key: dict[FeatureKey, type[BaseFeature[Any]]] = {} From 4d5ef8bb9d94472082a05e579d054df0a85ce133 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 16:37:28 +0100 Subject: [PATCH 56/73] fix: ts? --- .github/workflows/QA.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index dcaaab9ee..cc47ded1b 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -74,6 +74,9 @@ jobs: - name: Setup Magic Nix Cache uses: DeterminateSystems/magic-nix-cache-action@v8 - uses: nicknovitski/nix-develop@v1 + with: + # use devshell for Python 3.10 + arguments: "'.#\"python3.10\"'" - name: Sync dependencies run: uv python pin 3.10 && uv sync --all-extras --all-groups - name: Restore original environment From 3de5948c617f1eff369bb89f71ae8fac3bbf66f6 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 16:40:48 +0100 Subject: [PATCH 57/73] fix: permissions --- .github/workflows/QA.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index cc47ded1b..b52deeb10 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -62,6 +62,8 @@ jobs: runs-on: depot-ubuntu-latest env: UV_PYTHON_PREFERENCE: only-system + permissions: + contents: read steps: - name: Checkout code uses: actions/checkout@v4 From e88bac1392923a8331065c314a1e7851c6a034cb Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 16:47:57 +0100 Subject: [PATCH 58/73] fix: permissions --- .github/workflows/QA.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index b52deeb10..93b0b3684 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -64,6 +64,7 @@ jobs: UV_PYTHON_PREFERENCE: only-system permissions: contents: read + actions: write steps: - name: Checkout code uses: actions/checkout@v4 From 64e360aed3b7e29131cc58015700572ad513a563 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 17:00:18 +0100 Subject: [PATCH 59/73] fix: ? --- .github/workflows/QA.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 93b0b3684..f944cc298 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -76,6 +76,8 @@ jobs: nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache uses: DeterminateSystems/magic-nix-cache-action@v8 + with: + cache-write: false - uses: nicknovitski/nix-develop@v1 with: # use devshell for Python 3.10 @@ -115,6 +117,8 @@ jobs: nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache uses: DeterminateSystems/magic-nix-cache-action@v8 + with: + cache-write: false - uses: nicknovitski/nix-develop@v1 with: # use devshell for Python 3.10 From 5e1991669786cb619fb962e65f3a9c56a0820720 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 17:04:34 +0100 Subject: [PATCH 60/73] fix: disable cache one more --- .github/workflows/QA.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index f944cc298..3aa65194c 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -77,7 +77,7 @@ jobs: - name: Setup Magic Nix Cache uses: DeterminateSystems/magic-nix-cache-action@v8 with: - cache-write: false + use-gha-cache: false - uses: nicknovitski/nix-develop@v1 with: # use devshell for Python 3.10 @@ -118,7 +118,7 @@ jobs: - name: Setup Magic Nix Cache uses: DeterminateSystems/magic-nix-cache-action@v8 with: - cache-write: false + use-gha-cache: false - uses: nicknovitski/nix-develop@v1 with: # use devshell for Python 3.10 From 976a39174e9f3d5f1031520aef64206acf7fb0d2 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:20:26 +0100 Subject: [PATCH 61/73] fix: swap order --- .github/workflows/QA.yml | 24 ++++++++++++------------ src/metaxy/models/feature_spec.py | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 3aa65194c..e217d2dbc 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -79,18 +79,18 @@ jobs: with: use-gha-cache: false - uses: nicknovitski/nix-develop@v1 - with: - # use devshell for Python 3.10 - arguments: "'.#\"python3.10\"'" - - name: Sync dependencies - run: uv python pin 3.10 && uv sync --all-extras --all-groups - - name: Restore original environment - run: bash .github/scripts/restore-env.sh - - name: Run Basedpyright - run: uv run basedpyright --level error - - name: Cleanup nix environment - if: always() - run: bash .github/scripts/cleanup-nix-env.sh + - name: Run Typecheck in Nix Shell + run: | + nix develop '.#python3.10' -c bash <<'EOF' + # These commands run inside the temporary Nix shell + echo "--- Pinning Python and syncing dependencies ---" + uv python pin 3.10 + uv sync --all-extras --all-groups + + echo "--- Running Basedpyright ---" + uv run basedpyright --level error + EOF + test: needs: filter diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 59e94c842..61841632b 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -4,6 +4,7 @@ import json from collections.abc import Mapping, Sequence from functools import cached_property +from types import MappingProxyType from typing import ( TYPE_CHECKING, Annotated, @@ -19,7 +20,6 @@ ) import pydantic -from frozendict import frozendict from pydantic import BeforeValidator from pydantic.types import JsonValue from typing_extensions import Self @@ -399,7 +399,7 @@ def validate_id_columns(self) -> BaseFeatureSpec[IDColumnsT]: return self @pydantic.model_validator(mode="after") - def validate_metadata_json_serializable(self) -> "FeatureSpec": + def validate_metadata_json_serializable(self) -> Self: """Validate that metadata is JSON-serializable. This ensures that metadata can be safely serialized for storage, From e97295ffb88ca7135570160fe35b8442cc5e47e9 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:25:37 +0100 Subject: [PATCH 62/73] fix: simplify further; still nix challenges --- .github/workflows/QA.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index e217d2dbc..2ca9e8392 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -68,8 +68,6 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 - - name: Save original environment - run: bash .github/scripts/save-env.sh - name: Install Nix uses: cachix/install-nix-action@v27 with: @@ -78,7 +76,7 @@ jobs: uses: DeterminateSystems/magic-nix-cache-action@v8 with: use-gha-cache: false - - uses: nicknovitski/nix-develop@v1 + #- uses: nicknovitski/nix-develop@v1 - name: Run Typecheck in Nix Shell run: | nix develop '.#python3.10' -c bash <<'EOF' From a29eaab8971d6faea197c2ab87150ebab703afa4 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:28:45 +0100 Subject: [PATCH 63/73] fix: revert for test --- .github/workflows/QA.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 2ca9e8392..c74d28521 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -115,8 +115,6 @@ jobs: nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache uses: DeterminateSystems/magic-nix-cache-action@v8 - with: - use-gha-cache: false - uses: nicknovitski/nix-develop@v1 with: # use devshell for Python 3.10 From 8dd2ce895154713a18cdd6fb12e5911be2bdb4a9 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:34:34 +0100 Subject: [PATCH 64/73] fix: different glibc nix --- .github/workflows/QA.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index c74d28521..c2d5bc8d9 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -69,14 +69,15 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - name: Install Nix - uses: cachix/install-nix-action@v27 + uses: cachix/install-nix-action@v31 with: - nix_path: nixpkgs=channel:nixpkgs-unstable + install_url: https://install.determinate.systems/nix + extra_nix_config: | + extra-experimental-features = nix-command flakes + #nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache uses: DeterminateSystems/magic-nix-cache-action@v8 - with: - use-gha-cache: false - #- uses: nicknovitski/nix-develop@v1 + - name: Run Typecheck in Nix Shell run: | nix develop '.#python3.10' -c bash <<'EOF' @@ -110,7 +111,7 @@ jobs: - name: Save original environment run: bash .github/scripts/save-env.sh - name: Install Nix - uses: cachix/install-nix-action@v27 + uses: cachix/install-nix-action@v31 with: nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache @@ -181,7 +182,7 @@ jobs: - name: Save original environment run: bash .github/scripts/save-env.sh - name: Install Nix - uses: cachix/install-nix-action@v27 + uses: cachix/install-nix-action@v31 with: nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache From 139658ddc89d18e952cbb1547430a68005a24bde Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:37:13 +0100 Subject: [PATCH 65/73] fix: k --- .github/workflows/QA.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index c2d5bc8d9..32c76abe5 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -71,7 +71,7 @@ jobs: - name: Install Nix uses: cachix/install-nix-action@v31 with: - install_url: https://install.determinate.systems/nix + # install_url: https://install.determinate.systems/nix extra_nix_config: | extra-experimental-features = nix-command flakes #nix_path: nixpkgs=channel:nixpkgs-unstable From aec40b420adfb90bc3aa7dc9a4ef4ca0fa4c456e Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:40:19 +0100 Subject: [PATCH 66/73] fix: non-interactive --- .github/workflows/QA.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 32c76abe5..d78ef2ef8 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -71,7 +71,8 @@ jobs: - name: Install Nix uses: cachix/install-nix-action@v31 with: - # install_url: https://install.determinate.systems/nix + install_url: https://install.determinate.systems/nix + install_options: '--no-confirm' extra_nix_config: | extra-experimental-features = nix-command flakes #nix_path: nixpkgs=channel:nixpkgs-unstable From f84414f21efb026962ff29ee61b79eb1ef67c50a Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:42:35 +0100 Subject: [PATCH 67/73] fix: use default shell --- .github/workflows/QA.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index d78ef2ef8..0811e4274 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -81,7 +81,7 @@ jobs: - name: Run Typecheck in Nix Shell run: | - nix develop '.#python3.10' -c bash <<'EOF' + nix develop . -c bash <<'EOF' # These commands run inside the temporary Nix shell echo "--- Pinning Python and syncing dependencies ---" uv python pin 3.10 From 18cfe6b6d58f00c6a069f6f376b81f016a398cdd Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:43:25 +0100 Subject: [PATCH 68/73] fix: use basic --- .github/workflows/QA.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 0811e4274..f3c620485 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -71,8 +71,8 @@ jobs: - name: Install Nix uses: cachix/install-nix-action@v31 with: - install_url: https://install.determinate.systems/nix - install_options: '--no-confirm' + #install_url: https://install.determinate.systems/nix + #install_options: '--no-confirm' extra_nix_config: | extra-experimental-features = nix-command flakes #nix_path: nixpkgs=channel:nixpkgs-unstable From c6d9fd39691d73e85777c1d3f63e93cf8fb7b4c4 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:49:55 +0100 Subject: [PATCH 69/73] fix: k --- .github/workflows/QA.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index f3c620485..a3aa17dbb 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -71,7 +71,7 @@ jobs: - name: Install Nix uses: cachix/install-nix-action@v31 with: - #install_url: https://install.determinate.systems/nix + install_url: https://install.determinate.systems/nix #install_options: '--no-confirm' extra_nix_config: | extra-experimental-features = nix-command flakes From 1831819c1b46aab9b21c9751775654650ddbc7b1 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:54:07 +0100 Subject: [PATCH 70/73] fix: k --- .github/workflows/QA.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index a3aa17dbb..f8e1263f9 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -72,7 +72,7 @@ jobs: uses: cachix/install-nix-action@v31 with: install_url: https://install.determinate.systems/nix - #install_options: '--no-confirm' + install_options: --no-confirm extra_nix_config: | extra-experimental-features = nix-command flakes #nix_path: nixpkgs=channel:nixpkgs-unstable From 7bf59d6b9f0dd30cb648b1002574daa3dae567c3 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:56:15 +0100 Subject: [PATCH 71/73] fix: k --- .github/workflows/QA.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index f8e1263f9..036b09ff7 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -71,8 +71,9 @@ jobs: - name: Install Nix uses: cachix/install-nix-action@v31 with: - install_url: https://install.determinate.systems/nix - install_options: --no-confirm + #install_url: https://install.determinate.systems/nix + install_url: https://install.determinate.systems/nix?extra_conf=nix-command flakes&no-confirm + #install_options: --no-confirm extra_nix_config: | extra-experimental-features = nix-command flakes #nix_path: nixpkgs=channel:nixpkgs-unstable From 4b5bab4c84a1e1a78795142d43540dc094b7f594 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 22:58:20 +0100 Subject: [PATCH 72/73] fix: k --- .github/workflows/QA.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 036b09ff7..969728230 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -71,10 +71,9 @@ jobs: - name: Install Nix uses: cachix/install-nix-action@v31 with: - #install_url: https://install.determinate.systems/nix - install_url: https://install.determinate.systems/nix?extra_conf=nix-command flakes&no-confirm - #install_options: --no-confirm + install_url: https://install.determinate.systems/nix extra_nix_config: | + auto-accept = true extra-experimental-features = nix-command flakes #nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache From f5d9e1a7425870bc3f249fcda776c2e078d53c44 Mon Sep 17 00:00:00 2001 From: Georg Heiler Date: Fri, 31 Oct 2025 23:00:25 +0100 Subject: [PATCH 73/73] fix: k --- .github/workflows/QA.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 969728230..9246d1548 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -72,8 +72,8 @@ jobs: uses: cachix/install-nix-action@v31 with: install_url: https://install.determinate.systems/nix + install_options: "--no-confirm" extra_nix_config: | - auto-accept = true extra-experimental-features = nix-command flakes #nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache