11# SPDX-FileCopyrightText: 2026 Raincloud Maintainers
22# SPDX-License-Identifier: Apache-2.0
3- """Stage 6 — compare actual parquet to the `expect` block."""
3+ """Stage 6 — compare actual parquet to the `expect` block.
4+
5+ Drift is treated as a signal, not an error: a row count or schema_hash
6+ mismatch emits a `[WARN]` line but does NOT abort the build. The intent
7+ is that users who ran `build <slug>` already opted into "download
8+ whatever's currently upstream"; brittle equality checks against a
9+ manifest captured weeks ago shouldn't turn an HF Arrow-conversion bump
10+ into a failed build. Pass `strict=True` (CLI `--strict`) to opt back
11+ into hard failures — useful for CI / pre-release gates.
12+
13+ Schema-hash comparison is prefix-aware: `expect.schema_hash` may be the
14+ full 64-char SHA-256 or a short prefix (the manifest convention is 12
15+ chars, matching the `schema_hash=` line printed by this stage). Equal-
16+ length values use strict equality; a shorter expected acts as a prefix
17+ match on the computed hash.
18+ """
419from __future__ import annotations
520
621import hashlib
22+ import sys
723from pathlib import Path
824
925import pyarrow .parquet as pq
1026
1127from .spec import spec_field
1228
1329
14- def validate (spec : dict , written : list [Path ], * , strict : bool = True ) -> list [dict ]:
30+ def _schema_hash_matches (actual : str , expected : str | None ) -> bool :
31+ """Strict equality when lengths match; prefix match when expected is
32+ shorter. Manifest entries are typically the 12-char short form."""
33+ if expected is None :
34+ return True
35+ if len (expected ) == len (actual ):
36+ return actual == expected
37+ if len (expected ) < len (actual ):
38+ return actual .startswith (expected )
39+ return False
40+
41+
42+ def validate (spec : dict , written : list [Path ], * , strict : bool = False ) -> list [dict ]:
1543 results = []
1644 for p in written :
1745 md = pq .ParquetFile (p ).metadata
@@ -20,7 +48,7 @@ def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[di
2048 ok_rows = expected_rows is None or actual_rows == expected_rows
2149 schema_hash = _schema_hash (pq .ParquetFile (p ).schema_arrow )
2250 expected_hash = spec_field (spec , "expect.schema_hash" )
23- ok_schema = expected_hash is None or schema_hash == expected_hash
51+ ok_schema = _schema_hash_matches ( schema_hash , expected_hash )
2452 result = {
2553 "path" : str (p ),
2654 "rows_ok" : ok_rows ,
@@ -29,8 +57,18 @@ def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[di
2957 "schema_hash" : schema_hash ,
3058 }
3159 results .append (result )
32- if strict and not (ok_rows and ok_schema ):
33- raise AssertionError (f"validation failed for { p } : { result } " )
60+ if not (ok_rows and ok_schema ):
61+ if not ok_rows :
62+ print (f"[WARN] { p .name } : rows drift "
63+ f"(expected={ expected_rows :,} actual={ actual_rows :,} )" ,
64+ file = sys .stderr )
65+ if not ok_schema :
66+ exp_disp = expected_hash if expected_hash else "—"
67+ print (f"[WARN] { p .name } : schema_hash drift "
68+ f"(expected={ exp_disp } actual={ schema_hash [:12 ]} )" ,
69+ file = sys .stderr )
70+ if strict :
71+ raise AssertionError (f"validation failed for { p } : { result } " )
3472 print (f"[validate] { p .name } rows={ actual_rows :,} "
3573 f"(expected={ expected_rows } ) schema_hash={ schema_hash [:12 ]} " )
3674 return results
0 commit comments