diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 831d4946f..9246d1548 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -37,7 +37,7 @@ jobs: runs-on: depot-ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install uv uses: astral-sh/setup-uv@v7 with: @@ -49,7 +49,7 @@ jobs: runs-on: depot-ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: Install uv uses: astral-sh/setup-uv@v7 with: @@ -58,36 +58,39 @@ jobs: with: dprint-version: 0.50.2 - # typecheck: - # runs-on: depot-ubuntu-latest - # env: - # UV_PYTHON_PREFERENCE: only-system - # steps: - # - name: Checkout code - # uses: actions/checkout@v4 - # - name: Install Nix - # uses: cachix/install-nix-action@v27 - # with: - # nix_path: nixpkgs=channel:nixpkgs-unstable - # - name: Setup Magic Nix Cache - # uses: DeterminateSystems/magic-nix-cache-action@v8 - # - uses: nicknovitski/nix-develop@v1 - # - name: Sync dependencies - # run: uv python pin 3.10 && uv sync --all-extras --all-groups - # - name: Replace bundled Node.js with Nix Node.js - # run: | - # # Find the bundled node binary and replace it with Nix's node - # BUNDLED_NODE=$(find .venv/lib/python3.10/site-packages/nodejs_wheel/bin -name "node" -type f 2>/dev/null || true) - # if [ -n "$BUNDLED_NODE" ]; then - # rm -f "$BUNDLED_NODE" - # ln -s "$(which node)" "$BUNDLED_NODE" - # echo "Replaced bundled node with Nix node: $(which node)" - # fi - # - name: Run Basedpyright - # run: uv run basedpyright --level error - # - name: Cleanup nix environment - # if: always() - # run: bash .github/scripts/cleanup-nix-env.sh + typecheck: + runs-on: depot-ubuntu-latest + env: + UV_PYTHON_PREFERENCE: only-system + permissions: + contents: read + actions: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Install Nix + uses: cachix/install-nix-action@v31 + with: + install_url: 
https://install.determinate.systems/nix + install_options: "--no-confirm" + extra_nix_config: | + extra-experimental-features = nix-command flakes + #nix_path: nixpkgs=channel:nixpkgs-unstable + - name: Setup Magic Nix Cache + uses: DeterminateSystems/magic-nix-cache-action@v8 + + - name: Run Typecheck in Nix Shell + run: | + nix develop . -c bash <<'EOF' + # These commands run inside the temporary Nix shell + echo "--- Pinning Python and syncing dependencies ---" + uv python pin 3.10 + uv sync --all-extras --all-groups + + echo "--- Running Basedpyright ---" + uv run basedpyright --level error + EOF + test: needs: filter @@ -109,7 +112,7 @@ jobs: - name: Save original environment run: bash .github/scripts/save-env.sh - name: Install Nix - uses: cachix/install-nix-action@v27 + uses: cachix/install-nix-action@v31 with: nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache @@ -180,7 +183,7 @@ jobs: - name: Save original environment run: bash .github/scripts/save-env.sh - name: Install Nix - uses: cachix/install-nix-action@v27 + uses: cachix/install-nix-action@v31 with: nix_path: nixpkgs=channel:nixpkgs-unstable - name: Setup Magic Nix Cache @@ -199,6 +202,8 @@ jobs: check: if: always() runs-on: depot-ubuntu-latest + permissions: + contents: read needs: - test - lint diff --git a/CLAUDE.md b/CLAUDE.md index 735ecb60b..0c7f402d1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -270,6 +270,11 @@ Features can override `load_input()` for custom join logic: This is critical for migrations when upstream dependencies change. +#### Attaching Metadata to Features + +Additional metadata (JSON) can be attached to features via the `metadata` parameter on `FeatureSpec`. +Usecases may be for data governance such as ownership, SLAs, PII flags, ... etc. 
+ ## Important Constraints ### Narwhals as the Public Interface diff --git a/docs/index.md b/docs/index.md index a86f345b6..8aa24d2fc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -106,6 +106,10 @@ When `Video` changes, Metaxy automatically identifies that `VoiceDetection` requ Every feature definition produces a deterministic version hash computed from its dependencies, fields, and code versions. When you modify a feature—whether changing its dependencies, adding fields, or updating transformation logic, Metaxy detects the change and propagates it downstream. This is done on multiple levels: `Feature` (class) level, field (class attribute) level, and of course on row level: each _sample_ in the metadata store tracks the version of _each field_ and the overall (class-level) feature version. +### Code vs Feature Versions + +`Feature.code_version` only looks at a feature's own fields. It hashes their keys and `code_version` values (in sorted order) and ignores the dependency graph entirely. Use it to answer _"did my feature's logic change?"_. In contrast, `Feature.feature_version()` includes both the local fields and every dependency, so it changes whenever parent features evolve. Checking both hashes lets you distinguish between local code updates and upstream changes. + This ensures that when feature definitions evolve, every feature that transitively depends on it can be systematically updated. Because Metaxy supports declaring dependencies on fields, it can identify when a feature _does not_ require recomputation, even if one of its parents has been changed (but only irrelevant fields did). This is a huge factor in improving efficiency and reducing unnecessary computations (and costs!). Because Metaxy feature graphs are static, Metaxy can calculate data version changes ahead of the actual computation. This enables patterns such as **computation preview** and **computation cost prediction**. 
diff --git a/docs/learn/feature-definitions.md b/docs/learn/feature-definitions.md index cdbab7ba9..b3fe54e35 100644 --- a/docs/learn/feature-definitions.md +++ b/docs/learn/feature-definitions.md @@ -1,23 +1,35 @@ # Feature System -Metaxy has a declarative (defined statically at class level), expressive, flexible feature system. It has been inspired by Software-Defined Assets in [Dagster](https://dagster.io/). +Metaxy has a declarative (defined statically at class level), expressive, flexible feature system. +It has been inspired by Software-Defined Assets in [Dagster](https://dagster.io/). -Features represent tabular **metadata**, typically containing references to external multi-modal **data** such as files, images, or videos. But it can be just pure **metadata** as well. +Features represent tabular **metadata**, typically containing references to external multi-modal **data** such as files, images, or videos. +But it can be just pure **metadata** as well. I will highlight **data** and **metadata** with bold so it really stands out. -Metaxy is responsible for providing correct **metadata** to users. During incremental processing, Metaxy will automatically resolve added, changed and deleted **metadata** rows and calculate the right [sample versions](data-versioning.md) for them. Metaxy does not interact with **data** directly, the user is responsible for writing it, typically using **metadata** to identify sample locations in storage (it's a good idea to inject the sample version into the data sample identifier). Metaxy is designed to be used with systems that do not overwrite existing **metadata** (Metaxy only appends **metadata**) and therefore **data** as well (while we cannot enforce that since the user is responsible for writing the data, it's easily achievable by **including the sample version into the data sample identifier**). +Metaxy is responsible for providing correct **metadata** to users. 
+During incremental processing, Metaxy will automatically resolve added, changed and deleted **metadata** rows and calculate the right [sample versions](data-versioning.md) for them. +Metaxy does not interact with **data** directly, the user is responsible for writing it, typically using **metadata** to identify sample locations in storage (it's a good idea to inject the sample version into the data sample identifier). +Metaxy is designed to be used with systems that do not overwrite existing **metadata** (Metaxy only appends **metadata**) and therefore **data** as well (while we cannot enforce that since the user is responsible for writing the data, it's easily achievable by **including the sample version into the data sample identifier**). I hope we can stop using bold for **data** and **metadata** from now on, hopefully we've made our point. -> [!tip] Include Sample Version In Your Data Path -> Include the sample version in your data path to ensure strong consistency guarantees. I mean it. Really do it! +> [!tip] Include sample version in your data path +> Include the sample version in your data path to ensure strong consistency guarantees. +> I mean it. +> Really do it! -Features live on a global `FeatureGraph` object (typically users do not need to interact with it directly). Features are bound to a specific Metaxy project, but can be moved between projects over time. Features must have unique (across all projects) `FeatureKey` associated with them. +Features live on a global `FeatureGraph` object (typically users do not need to interact with it directly). +Features are bound to a specific Metaxy project, but can be moved between projects over time. +Features must have unique (across all projects) `FeatureKey` associated with them. ## Feature Specs -Before we can define a `Feature`, we must first create a `FeatureSpec` object. But before we get to an example, it's necessary to understand the concept of ID columns. 
Metaxy must know how to uniquely identify feature samples and join metadata tables, therefore, you need to attach one or more ID columns to your `FeatureSpec`. Very often these ID columns would stay the same across many feature specs, therefore it makes a lot of sense to define them on a shared base class. +Before we can define a `Feature`, we must first create a `FeatureSpec` object. +But before we get to an example, it's necessary to understand the concept of ID columns. +Metaxy must know how to uniquely identify feature samples and join metadata tables, therefore, you need to attach one or more ID columns to your `FeatureSpec`. +Very often these ID columns would stay the same across many feature specs, therefore it makes a lot of sense to define them on a shared base class. Some boilerplate with typing is involved (this is typically a good thing): @@ -36,11 +48,17 @@ class VideoFeatureSpec(BaseFeatureSpec[VideoIds]): `BaseFeatureSpec` is a [Pydantic](https://docs.pydantic.dev/latest/) model, so all normal Pydantic features apply. +Feature specs now support an optional `metadata` dictionary for attaching ownership, documentation, or tooling context to a feature. +This metadata **never** influences graph topology or feature version hashes (it is still part of the serialized spec hashed by `feature_spec_version`), must be JSON-serializable, and is immutable once the spec is created. +It is ideal for values such as owners, SLAs, runbooks, or tags that external systems may want to inspect. + With our `VideoFeatureSpec` in place, we can proceed to defining features that would be using it. ## Feature Definitions -Metaxy provides a `BaseFeature` class that can be extended to make user-defined features. It's a Pydantic model as well. User-defined `BaseFeature` classes must have fields matching ID columns of the `FeatureSpec` they are using. +Metaxy provides a `BaseFeature` class that can be extended to make user-defined features. +It's a Pydantic model as well. 
+User-defined `BaseFeature` classes must have fields matching ID columns of the `FeatureSpec` they are using. With respect to the same DRY principle, we can define a shared base class for features that use the `VideoFeatureSpec`. @@ -61,7 +79,9 @@ class VideoFeature(BaseVideoFeature, spec=VideoFeatureSpec(key="/raw/video")): path: str ``` -That's it! That's a roow feature, it doesn't have any dependencies. Easy. +That's it! +That's a raw single feature, it doesn't have any dependencies. +Easy. You may now use `VideoFeature.spec()` class method to access the original feature spec: it's bound to the class. @@ -81,17 +101,23 @@ Hurray! You get the idea. ## Field-Level Dependencies -A core (I'be straight: a killer) feature of Metaxy is the concept of **field-level dependencies**. These are used to define dependencies between logical fields of features. +A core (I'll be straight: a killer) feature of Metaxy is the concept of **field-level dependencies**. +These are used to define dependencies between logical fields of features. -A **field** is not to be confused with metadata _column_ (Pydantic fields). Fields are completely independent from them. +A **field** is not to be confused with metadata _column_ (Pydantic fields). +Fields are completely independent from them. Columns refer to _metadata_ and are stored in metadata stores (such as databases) supported by Metaxy. -Fields refer to _data_ and are logical -- users are free to define them as they see fit. Fields are supposed to represent parts of data that users care about. For example, a `Video` feature -- an `.mp4` file -- may have `frames` and `audio` fields. +Fields refer to _data_ and are logical -- users are free to define them as they see fit. +Fields are supposed to represent parts of data that users care about. +For example, a `Video` feature -- an `.mp4` file -- may have `frames` and `audio` fields. -Downstream features can depend on specific fields of upstream features. 
This enables fine-grained control over data versioning, avoiding unnecessary reprocessing. +Downstream features can depend on specific fields of upstream features. +This enables fine-grained control over data versioning, avoiding unnecessary reprocessing. -At this point, careful readers have probably noticed that the `Transcript` feature from the [example](#feature-specs) above should not depend on the full video: it only needs the audio track in order to generate the transcript. Let's express that with Metaxy: +At this point, careful readers have probably noticed that the `Transcript` feature from the [example](#feature-specs) above should not depend on the full video: it only needs the audio track in order to generate the transcript. +Let's express that with Metaxy: ```py from metaxy import FieldDep, FieldSpec @@ -114,13 +140,16 @@ The [Data Versioning](data-versioning.md) docs explain more about this system. ### Fully Qualified Field Key -A **fully qualified field key (FQFK)** is an identifier that uniquely identifies a field within the whole feature graph. It consists of the **feature key** and the **field key**, separated by a colon, for example: `/raw/video:frames`, `/raw/video:audio/english`. +A **fully qualified field key (FQFK)** is an identifier that uniquely identifies a field within the whole feature graph. +It consists of the **feature key** and the **field key**, separated by a colon, for example: `/raw/video:frames`, `/raw/video:audio/english`. ## A Note on Type Coercion for Metaxy types Internally, Metaxy uses strongly typed Pydantic models to represent feature keys, their fields, and the dependencies between them. -To avoid boilerplate, Metaxy also has syntactic sugar for construction of these classes. Different ways to provide them are automatically coerced into canonical internal models. This is fully typed and only affects **constructor arguments**, so accessing **attributes** on Metaxy models will always return only the canonical types. 
+To avoid boilerplate, Metaxy also has syntactic sugar for construction of these classes. +Different ways to provide them are automatically coerced into canonical internal models. +This is fully typed and only affects **constructor arguments**, so accessing **attributes** on Metaxy models will always return only the canonical types. Some examples: @@ -133,7 +162,8 @@ key = FeatureKey("prefix", "feature") same_key = FeatureKey(key) ``` -Metaxy really loves you, the user! See [syntactic sugar](#syntactic-sugar) for more details. +Metaxy really loves you, the user! +See [syntactic sugar](#syntactic-sugar) for more details. ## Syntactic Sugar diff --git a/pyproject.toml b/pyproject.toml index 7161c27f8..53b6623db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ authors = [ requires-python = ">=3.10" dependencies = [ "cyclopts==4.0.0b1", + "frozendict>=2.4.4", "narwhals>=2.9.0", "polars>=1.33.1", "polars-hash>=0.5.1", diff --git a/src/metaxy/models/feature_spec.py b/src/metaxy/models/feature_spec.py index 4ac81e87e..61841632b 100644 --- a/src/metaxy/models/feature_spec.py +++ b/src/metaxy/models/feature_spec.py @@ -4,6 +4,7 @@ import json from collections.abc import Mapping, Sequence from functools import cached_property +from types import MappingProxyType from typing import ( TYPE_CHECKING, Annotated, @@ -13,14 +14,17 @@ Protocol, TypeAlias, TypeVar, + cast, overload, runtime_checkable, ) import pydantic from pydantic import BeforeValidator +from pydantic.types import JsonValue from typing_extensions import Self +from metaxy.models.bases import FrozenBaseModel from metaxy.models.field import FieldSpec, SpecialFieldDep from metaxy.models.types import ( CoercibleToFeatureKey, @@ -206,7 +210,49 @@ def table_name(self) -> str: ) # bound, should be used for generic -class _BaseFeatureSpec(pydantic.BaseModel): +def _freeze_metadata(value: Any) -> Any: + """Recursively convert metadata to immutable containers.""" + if isinstance(value, Mapping): + frozen_dict 
= {k: _freeze_metadata(v) for k, v in value.items()} + return MappingProxyType(frozen_dict) + if isinstance(value, list): + return tuple(_freeze_metadata(v) for v in value) + if isinstance(value, tuple): + return tuple(_freeze_metadata(v) for v in value) + return value + + +def _thaw_metadata(value: Any) -> Any: + if isinstance(value, MappingProxyType): + return {k: _thaw_metadata(v) for k, v in value.items()} + if isinstance(value, tuple): + return [_thaw_metadata(v) for v in value] + if isinstance(value, list): + return [_thaw_metadata(v) for v in value] + return value + + +def _coerce_metadata(value: Any) -> dict[str, JsonValue] | None: + if value is None: + return None + if not isinstance(value, Mapping): + raise ValueError("metadata must be a mapping") + try: + serialized = json.dumps(value) + except (TypeError, ValueError) as exc: + raise ValueError( + "metadata must be JSON-serializable. Found non-serializable value" + ) from exc + return cast(dict[str, JsonValue], json.loads(serialized)) + + +MetadataField = Annotated[ + dict[str, JsonValue] | None, + BeforeValidator(_coerce_metadata), +] + + +class _BaseFeatureSpec(FrozenBaseModel): key: Annotated[FeatureKey, BeforeValidator(FeatureKeyAdapter.validate_python)] deps: list[FeatureDep] | None = None fields: list[FieldSpec] = pydantic.Field( @@ -218,11 +264,33 @@ class _BaseFeatureSpec(pydantic.BaseModel): ) ] ) + metadata: MetadataField = pydantic.Field( + default=None, + description="Metadata attached to this feature.", + ) class BaseFeatureSpec(_BaseFeatureSpec, Generic[IDColumnsT]): id_columns: pydantic.SkipValidation[IDColumnsT] + @pydantic.model_validator(mode="before") + @classmethod + def _default_metadata(cls, values: dict[str, Any]) -> dict[str, Any]: + # Allow callers to omit metadata or pass None while keeping the field non-optional. 
+ if "metadata" in values and values["metadata"] is None: + values.pop("metadata", None) + elif "metadata" in values: + metadata_value = values["metadata"] + if not isinstance(metadata_value, Mapping): + raise ValueError("metadata must be a mapping") + try: + json.dumps(metadata_value) + except (TypeError, ValueError) as exc: + raise ValueError( + "metadata must be JSON-serializable. Found non-serializable value" + ) from exc + return values + @overload def __init__( self, @@ -231,6 +299,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, + metadata: Mapping[str, Any] | JsonValue | None = None, ) -> None: """Initialize from string key.""" ... @@ -243,6 +312,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, + metadata: Mapping[str, Any] | JsonValue | None = None, ) -> None: """Initialize from sequence of parts.""" ... @@ -255,6 +325,7 @@ def __init__( deps: list[FeatureDep] | None = None, fields: list[FieldSpec] | None = None, id_columns: list[str] | None = None, + metadata: Mapping[str, Any] | JsonValue | None = None, ) -> None: """Initialize from FeatureKey instance.""" ... @@ -327,6 +398,33 @@ def validate_id_columns(self) -> BaseFeatureSpec[IDColumnsT]: ) return self + @pydantic.model_validator(mode="after") + def validate_metadata_json_serializable(self) -> Self: + """Validate that metadata is JSON-serializable. + + This ensures that metadata can be safely serialized for storage, + transmission, and graph snapshots. + + Note: Metadata is kept as a mutable dict for Pydantic serialization compatibility, + but users should treat it as immutable. The frozen FeatureSpec model prevents + reassignment of the metadata field itself. 
+ + Raises: + ValueError: If metadata contains non-JSON-serializable types + """ + if self.metadata is not None: + try: + # Attempt to serialize and deserialize to validate + json.dumps(self.metadata) + except (TypeError, ValueError) as e: + raise ValueError( + f"metadata must be JSON-serializable. " + f"Found non-serializable value: {e}" + ) from e + object.__setattr__(self, "metadata", _freeze_metadata(self.metadata)) + + return self + @property def feature_spec_version(self) -> str: """Compute SHA256 hash of the complete feature specification. @@ -354,6 +452,8 @@ def feature_spec_version(self) -> str: # Use model_dump with mode="json" for deterministic serialization # This ensures all types (like FeatureKey) are properly serialized spec_dict = self.model_dump(mode="json") + if spec_dict.get("metadata") == {}: + spec_dict.pop("metadata", None) # Sort keys to ensure deterministic ordering spec_json = json.dumps(spec_dict, sort_keys=True) diff --git a/tests/__snapshots__/test_feature_project_detection.ambr b/tests/__snapshots__/test_feature_project_detection.ambr index 1cbe30459..1986f6fef 100644 --- a/tests/__snapshots__/test_feature_project_detection.ambr +++ b/tests/__snapshots__/test_feature_project_detection.ambr @@ -11,7 +11,7 @@ # name: test_feature_project_persists_across_graph_operations dict({ 'project': 'persist_project', - 'tracking_version': '6411d6d0', + 'tracking_version': '2a72c9e2', }) # --- # name: test_multiple_features_same_project diff --git a/tests/__snapshots__/test_feature_tracking_version.ambr b/tests/__snapshots__/test_feature_tracking_version.ambr index 49fb53488..f37043726 100644 --- a/tests/__snapshots__/test_feature_tracking_version.ambr +++ b/tests/__snapshots__/test_feature_tracking_version.ambr @@ -12,8 +12,8 @@ 'feature_versions_same': True, 'project_a': 'project_a', 'project_b': 'project_b', - 'tracking_version_a': 'fc7bde342b5583a99961c5c1a5c6188d362a691d86d6c61d0b92cb03115f6ff9', - 'tracking_version_b': 
'd6b58547ece9d614fd6194165ab23277d4d7db76656709f260e4866f02f18cb5', + 'tracking_version_a': '7da02a89202f8eb9f19782512fadf988b815df8255e52a1f35dec9da76e18b12', + 'tracking_version_b': '5908b5019bdc1acd103482f1d144338f407f50d9d1aa09ca3808fe376f5ffd98', 'tracking_versions_differ': True, }) # --- @@ -43,5 +43,5 @@ }) # --- # name: test_tracking_version_deterministic - '16cd8a38139da8d77590ab9c4503b68c376bc83b9a7e38b9d793e23c7efd8172' + '139c7982886b761076245c878e6ad545f4afb9ffabab9238d7af883aff161b9b' # --- diff --git a/tests/__snapshots__/test_id_columns.ambr b/tests/__snapshots__/test_id_columns.ambr index d8afcaefa..21c11dbe0 100644 --- a/tests/__snapshots__/test_id_columns.ambr +++ b/tests/__snapshots__/test_id_columns.ambr @@ -1,8 +1,8 @@ # serializer version: 1 # name: test_snapshot_stability_with_id_columns dict({ - 'composite': 'fe41246bb432835bcfced693551b7bdb25800be120315ebe353a4fea51c43523', - 'default': '92cb08d343fc7b981059efceeb076d4c0e5d5b60e18e25860a24abf5c94ee320', - 'single_custom': 'd23b1833c20c5d9da30008c1655c0bb09f9ed2c8bab27e42ee63c2c5bfdf7a14', + 'composite': '8dc629cb2bc4bf404e15495f03c7efdcb339b96531fe6579d3f927a82589f843', + 'default': '0f5dac77217aa051fd504566de0fa3e1d8d4a360ed7958e716ff0f6ebb483a34', + 'single_custom': 'cfca9196fcb832b9da472042d9fd1ae5c485352d8e409d3d7c1f739880249c22', }) # --- diff --git a/tests/__snapshots__/test_spec_version.ambr b/tests/__snapshots__/test_spec_version.ambr index 013c3de5b..eba50e02c 100644 --- a/tests/__snapshots__/test_spec_version.ambr +++ b/tests/__snapshots__/test_spec_version.ambr @@ -1,14 +1,14 @@ # serializer version: 1 # name: test_feature_spec_version_deterministic - 'd58f485df70cf7afb2e0fc7e54eebf52234a1be0c22015ccd3239739077aae52' + 'b0741037ecd180b5948761b9d73b4dd78efa9cf7224240d3e3b718bae26e5852' # --- # name: test_feature_spec_version_includes_all_properties - '9421fb545f2ace1ffb1e07c2d14cc62427d0d261a3b509c8a1473f574efc8cce' + 
'8b54ae3cd476f3071a9437d4626658c15e40005380a0e2093585d442866e167b' # --- # name: test_feature_spec_version_recorded_in_metadata_store dict({ 'feature_key': 'recorded/feature', - 'feature_spec_version': '1103e6a29e9cc569cde596388b9da0961f2e308ae7d492013dce6bbf9566994c', + 'feature_spec_version': '39bcce7c7d9bc23f78982655717dc56879265b87304a96f371c7935df5c59d54', 'feature_version': '0c7b2d83252fbf2f689bec7d37c4f7ffb103755d11eafabc1e000a372f415d83', 'snapshot_version': '14d4294da40f4ab27ca85eb739fffd70e92c92522dbcba995cfa2aa343988bf3', }) @@ -34,16 +34,17 @@ 'snapshot', 'test', ]), + 'metadata': None, }), - 'feature_spec_version': '63d427712eb566d6c066c2514693714d82df604af91b48eae453634c4d684339', - 'feature_tracking_version': 'dabc7e8b54edb77992145840b2df9c81d4e384f3cdb1736bd3d5ecb83b0f422a', + 'feature_spec_version': 'ceaebb2a96f45752b2ca7daf8326b893d029cc56b366d7ee67bd4a3667008a03', + 'feature_tracking_version': '7dac08b916cbef01ec6971f5dd00a1c520c31ed6c09d1fd2223e82ed418546fb', 'feature_version': '7cfde77960e3cf10327e9cb97f311418b101fc5ee2c146922a997306e318edaf', 'project': 'test', }) # --- # name: test_feature_spec_version_with_column_selection_and_rename - '9bdc3eb37440620d259db2709010b3d920a7c3147ba30d370494cb3fecc787fc' + 'a412fd83810799e529173eed40fbcc29ed9adaed8c37ac4ffb21edd4f47c60d1' # --- # name: test_feature_spec_version_with_multiple_complex_deps - '3a24675ffa9a6a053dd6738cfe1706470cabc458ec5702d5837138a205bdf4fc' + '09d45d5f79ef381519aad8a30dae3c2bb253f8bc2e61c63114b8ac48fa8efcbc' # --- diff --git a/tests/examples/__snapshots__/test_recompute.ambr b/tests/examples/__snapshots__/test_recompute.ambr index 13f6cf1c9..0e4bdc1a3 100644 --- a/tests/examples/__snapshots__/test_recompute.ambr +++ b/tests/examples/__snapshots__/test_recompute.ambr @@ -2,18 +2,18 @@ # name: test_list_features[1] ''' --- - examples/parent (version + examples/parent (version 0aad9b8a2ea055cde3c4149fc4cde576e6478a982cea75b45c3cd012db43a5e8) Fields: - embeddings (code_version 1, 
version + embeddings (code_version 1, version 05e66510da58ef37168095b60188849cd6b1f0a4b539d0ac29ffd1e15b756459) --- - examples/child (version + examples/child (version 440ffb028aaa5cb21b155c4ef21debd81f283f99aa91ef58cbe541d71164b44f) Feature Dependencies: feature=examples/parent columns=None rename=None Fields: - predictions (code_version 1, version + predictions (code_version 1, version 1905d9e85ceb58b0361e863c7fbfbf5843582f8ed135ea5ce6c24ee2c68c6bb4) ''' @@ -21,18 +21,18 @@ # name: test_list_features[2] ''' --- - examples/parent (version + examples/parent (version a007f308d0a852e3fbf80a442bd0089e29eed94efefe85231ef4fc927aa7d737) Fields: - embeddings (code_version 2, version + embeddings (code_version 2, version 3c8d3e9ba031ab3613eb4db0877d3959fce76d94d625335b89bae7fbd4f27add) --- - examples/child (version + examples/child (version 7251e21c32d2d8e35a8ba389a8ce1b597663f206dee0ff55e542a3af1f1665cd) Feature Dependencies: feature=examples/parent columns=None rename=None Fields: - predictions (code_version 1, version + predictions (code_version 1, version bcb950543ea50ba3e19aef4846cbc3d36725e2548040c4327219eb2e75e6d997) ''' diff --git a/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr b/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr index ff3830bf3..d3eedb291 100644 --- a/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr +++ b/tests/metadata_stores/__snapshots__/test_snapshot_push.ambr @@ -29,19 +29,19 @@ 'versions': list([ dict({ 'feature_key': 'downstream', - 'feature_spec_version': 'c7e67b870148746be6b9caf37ea7fc6c46acb858c8ec31ff2f2961f0a5ac1754', + 'feature_spec_version': '20d33a795e3990ca63dbde8a5cd9ee3ab48738c4c1f6375cdcbc638202c505e5', 'feature_version': '7b8e38a11f800fb93c18c0e04c4609d7b52ec12b62177ba8355531d363f01751', 'snapshot_version': '1ab4ecd9d7af535b843dc22416dd99d5e430317c0a2bcd01fe83ffc2d7392da8', }), dict({ 'feature_key': 'upstream', - 'feature_spec_version': 
'66c1d90d5ff75442180f421208a4bfdf551d99b1814dbebddeed0a4a8b07423a', + 'feature_spec_version': 'a8e3517bbf15ca3ecc14d6300c3c78ace14765963993cfc5ffbee8512c6a7655', 'feature_version': '8a2ffeab8da447095c5ee7a77e5635a1e16e7f3605021732f50f7002fa258398', 'snapshot_version': '1ab4ecd9d7af535b843dc22416dd99d5e430317c0a2bcd01fe83ffc2d7392da8', }), dict({ 'feature_key': 'downstream', - 'feature_spec_version': 'bd492837d4324369f47605ee647b7aaeb5db68ac240a85a40fbd9e59351afa2a', + 'feature_spec_version': '8ba901bb7efebf6b75ab81334237508b6e399f5515d818e3f76a4cd6421991a4', 'feature_version': '7b8e38a11f800fb93c18c0e04c4609d7b52ec12b62177ba8355531d363f01751', 'snapshot_version': '1ab4ecd9d7af535b843dc22416dd99d5e430317c0a2bcd01fe83ffc2d7392da8', }), diff --git a/tests/test_feature_metadata.py b/tests/test_feature_metadata.py new file mode 100644 index 000000000..e9ecd18bb --- /dev/null +++ b/tests/test_feature_metadata.py @@ -0,0 +1,84 @@ +import json + +import pytest + +from metaxy import Feature, FeatureKey, FeatureSpec +from metaxy.models.feature import FeatureGraph + + +def test_metadata_does_not_affect_version() -> None: + """Metadata differences should not change feature version hashes.""" + graph_a = FeatureGraph() + with graph_a.use(): + + class MetadataFeatureA( + Feature, + spec=FeatureSpec( + key=FeatureKey(["tests", "metadata", "same"]), + deps=None, + metadata={"owner": "team-a"}, + ), + ): + pass + + version_a = MetadataFeatureA.feature_version() + + graph_b = FeatureGraph() + with graph_b.use(): + + class MetadataFeatureB( + Feature, + spec=FeatureSpec( + key=FeatureKey(["tests", "metadata", "same"]), + deps=None, + metadata={"owner": "team-b"}, + ), + ): + pass + + version_b = MetadataFeatureB.feature_version() + + assert version_a == version_b + + +def test_metadata_json_serializable() -> None: + """Validate JSON serialization enforcement for metadata.""" + valid_metadata = { + "string": "value", + "number": 42, + "float": 3.14, + "bool": True, + "null": None, + 
"list": [1, 2, 3], + "nested": {"key": "value"}, + } + + spec = FeatureSpec( + key=FeatureKey(["tests", "metadata", "json"]), + deps=None, + metadata=valid_metadata, + ) + assert spec.metadata is not None + assert isinstance(spec.metadata, dict) + assert isinstance(spec.metadata["list"], list) + assert json.dumps(spec.metadata) is not None + + with pytest.raises(ValueError): + FeatureSpec( + key=FeatureKey(["tests", "metadata", "json"]), + deps=None, + metadata={"func": lambda x: x}, + ) + + +def test_metadata_immutable() -> None: + """Metadata mapping should be immutable after initialization.""" + spec = FeatureSpec( + key=FeatureKey(["tests", "metadata", "immutable"]), + deps=None, + metadata={"key": "value"}, + ) + assert spec.metadata is not None + + with pytest.raises(Exception): + spec.metadata = {"key": "new_value"} # type: ignore[assignment] diff --git a/uv.lock b/uv.lock index c46262fbd..26f464e48 100644 --- a/uv.lock +++ b/uv.lock @@ -518,6 +518,25 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, ] +[[package]] +name = "frozendict" +version = "2.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/59/19eb300ba28e7547538bdf603f1c6c34793240a90e1a7b61b65d8517e35e/frozendict-2.4.6.tar.gz", hash = "sha256:df7cd16470fbd26fc4969a208efadc46319334eb97def1ddf48919b351192b8e", size = 316416, upload-time = "2024-10-13T12:15:32.449Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a6/7f/e80cdbe0db930b2ba9d46ca35a41b0150156da16dfb79edcc05642690c3b/frozendict-2.4.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c3a05c0a50cab96b4bb0ea25aa752efbfceed5ccb24c007612bc63e51299336f", size = 37927, upload-time = "2024-10-13T12:14:17.927Z" 
}, + { url = "https://files.pythonhosted.org/packages/29/98/27e145ff7e8e63caa95fb8ee4fc56c68acb208bef01a89c3678a66f9a34d/frozendict-2.4.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f5b94d5b07c00986f9e37a38dd83c13f5fe3bf3f1ccc8e88edea8fe15d6cd88c", size = 37945, upload-time = "2024-10-13T12:14:19.976Z" }, + { url = "https://files.pythonhosted.org/packages/ac/f1/a10be024a9d53441c997b3661ea80ecba6e3130adc53812a4b95b607cdd1/frozendict-2.4.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4c789fd70879ccb6289a603cdebdc4953e7e5dea047d30c1b180529b28257b5", size = 117656, upload-time = "2024-10-13T12:14:22.038Z" }, + { url = "https://files.pythonhosted.org/packages/46/a6/34c760975e6f1cb4db59a990d58dcf22287e10241c851804670c74c6a27a/frozendict-2.4.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da6a10164c8a50b34b9ab508a9420df38f4edf286b9ca7b7df8a91767baecb34", size = 117444, upload-time = "2024-10-13T12:14:24.251Z" }, + { url = "https://files.pythonhosted.org/packages/62/dd/64bddd1ffa9617f50e7e63656b2a7ad7f0a46c86b5f4a3d2c714d0006277/frozendict-2.4.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9a8a43036754a941601635ea9c788ebd7a7efbed2becba01b54a887b41b175b9", size = 116801, upload-time = "2024-10-13T12:14:26.518Z" }, + { url = "https://files.pythonhosted.org/packages/45/ae/af06a8bde1947277aad895c2f26c3b8b8b6ee9c0c2ad988fb58a9d1dde3f/frozendict-2.4.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c9905dcf7aa659e6a11b8051114c9fa76dfde3a6e50e6dc129d5aece75b449a2", size = 117329, upload-time = "2024-10-13T12:14:28.485Z" }, + { url = "https://files.pythonhosted.org/packages/d2/df/be3fa0457ff661301228f4c59c630699568c8ed9b5480f113b3eea7d0cb3/frozendict-2.4.6-cp310-cp310-win_amd64.whl", hash = "sha256:323f1b674a2cc18f86ab81698e22aba8145d7a755e0ac2cccf142ee2db58620d", size = 37522, upload-time = "2024-10-13T12:14:30.418Z" }, + { url = 
"https://files.pythonhosted.org/packages/4a/6f/c22e0266b4c85f58b4613fec024e040e93753880527bf92b0c1bc228c27c/frozendict-2.4.6-cp310-cp310-win_arm64.whl", hash = "sha256:eabd21d8e5db0c58b60d26b4bb9839cac13132e88277e1376970172a85ee04b3", size = 34056, upload-time = "2024-10-13T12:14:31.757Z" }, + { url = "https://files.pythonhosted.org/packages/04/13/d9839089b900fa7b479cce495d62110cddc4bd5630a04d8469916c0e79c5/frozendict-2.4.6-py311-none-any.whl", hash = "sha256:d065db6a44db2e2375c23eac816f1a022feb2fa98cbb50df44a9e83700accbea", size = 16148, upload-time = "2024-10-13T12:15:26.839Z" }, + { url = "https://files.pythonhosted.org/packages/ba/d0/d482c39cee2ab2978a892558cf130681d4574ea208e162da8958b31e9250/frozendict-2.4.6-py312-none-any.whl", hash = "sha256:49344abe90fb75f0f9fdefe6d4ef6d4894e640fadab71f11009d52ad97f370b9", size = 16146, upload-time = "2024-10-13T12:15:28.16Z" }, + { url = "https://files.pythonhosted.org/packages/a5/8e/b6bf6a0de482d7d7d7a2aaac8fdc4a4d0bb24a809f5ddd422aa7060eb3d2/frozendict-2.4.6-py313-none-any.whl", hash = "sha256:7134a2bb95d4a16556bb5f2b9736dceb6ea848fa5b6f3f6c2d6dba93b44b4757", size = 16146, upload-time = "2024-10-13T12:15:29.495Z" }, +] + [[package]] name = "ghp-import" version = "2.1.0" @@ -999,6 +1018,7 @@ version = "0.0.0" source = { editable = "." } dependencies = [ { name = "cyclopts" }, + { name = "frozendict" }, { name = "narwhals" }, { name = "polars" }, { name = "polars-hash" }, @@ -1064,6 +1084,7 @@ docs = [ [package.metadata] requires-dist = [ { name = "cyclopts", git = "https://github.com/BrianPugh/cyclopts.git?branch=mkdocs-plugin" }, + { name = "frozendict", specifier = ">=2.4.4" }, { name = "ibis-framework", marker = "extra == 'ibis'", specifier = ">=11.0.0" }, { name = "mermaid-py", marker = "extra == 'mermaid'", specifier = ">=0.8.0" }, { name = "narwhals", specifier = ">=2.9.0" },