Skip to content

Commit b6a3e34

Browse files
authored
Merge pull request #12 from spiraldb/mp/release-0.1.3
release 0.1.3: validate warns-not-raises; schema_hash prefix match
2 parents f67b90d + 0538274 commit b6a3e34

5 files changed

Lines changed: 92 additions & 11 deletions

File tree

CHANGELOG.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,35 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.1.3] - 2026-05-10
9+
10+
### Changed
11+
12+
- **Validate stage no longer hard-fails on row/schema_hash drift by
13+
default.** A mismatch now emits a `[WARN]` line to stderr and the build
14+
continues. Users invoking `python -m scripts.pipeline.build <slug>` have
15+
already opted into "fetch whatever is upstream now"; an upstream Arrow-
16+
conversion bump or a slightly-grown row count shouldn't turn that into a
17+
failed build. Pass `--strict` (new flag on `scripts.pipeline.build`) to
18+
upgrade warnings to errors — recommended for CI / pre-release gates.
19+
- The previous `--loose` flag has been removed; its behaviour (warn, don't
20+
raise) is now the default. Migrate `--loose` invocations by dropping the
21+
flag entirely; replace any "default-strict" CI invocations with
22+
`--strict`.
23+
24+
### Fixed
25+
26+
- **`validate.py` now compares `expect.schema_hash` as a prefix when the
27+
manifest value is shorter than the full 64-char SHA-256.** All 37 slugs
28+
with `schema_hash` set in `sources.json` use a 12-char short hash
29+
(matching the `[validate] schema_hash=` print convention, akin to git
30+
short SHAs); the previous full-string equality made every one of them
31+
fail validation on rebuild. Equal-length values still use strict
32+
equality, so full hashes remain enforceable for callers that prefer
33+
them.
34+
- `sources.schema.md` updated to document the prefix-match rule and the
35+
new warn-vs-`--strict` semantics for the `expect` block.
36+
837
## [0.1.2] - 2026-05-10
938

1039
### Fixed
@@ -106,6 +135,7 @@ This release bundles:
106135
this repository" button in the repo sidebar with BibTeX / APA / Chicago
107136
exports.
108137

138+
[0.1.3]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.3
109139
[0.1.2]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.2
110140
[0.1.1]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.1
111141
[0.1.0]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.0

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "raincloud"
3-
version = "0.1.2"
3+
version = "0.1.3"
44
description = "Client-reproducible pipeline for building a curated catalog of public datasets as Parquet + Vortex files."
55
readme = "README.md"
66
requires-python = ">=3.11"

scripts/pipeline/build.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,17 @@
33
"""End-to-end orchestrator. Runs fetch → extract → parse → transform →
44
write → validate for one or more datasets selected from sources.json.
55
6+
By default the validate stage treats row/schema_hash drift as a warning
7+
(`[WARN]` to stderr) and continues — users invoking a build already opted
8+
into "download whatever's upstream now," so an upstream Arrow-conversion
9+
bump shouldn't brick their build. Pass `--strict` to upgrade those
10+
warnings to errors; that's the recommended setting for CI / pre-release
11+
gates where drift should block.
12+
613
Examples:
714
python -m scripts.pipeline.build clickbench-hits
815
python -m scripts.pipeline.build --family uci
9-
python -m scripts.pipeline.build --all --loose
16+
python -m scripts.pipeline.build --all --strict # CI mode
1017
"""
1118
from __future__ import annotations
1219

@@ -57,7 +64,9 @@ def main() -> int:
5764
ap.add_argument("slugs", nargs="*", help="specific slugs to build")
5865
ap.add_argument("--family", help="build all datasets in this family")
5966
ap.add_argument("--all", action="store_true", help="build every dataset")
60-
ap.add_argument("--loose", action="store_true", help="warn on validation failures instead of erroring")
67+
ap.add_argument("--strict", action="store_true",
68+
help="upgrade validate-stage drift warnings to hard errors "
69+
"(off by default; recommended for CI / pre-release gates)")
6170
ap.add_argument("--clean-workdir", action="store_true",
6271
help="after each successful build, remove _workdir/<slug>/ "
6372
"so large decompressed intermediates (e.g. Public BI bz2→csv) "
@@ -80,7 +89,7 @@ def main() -> int:
8089

8190
ok = failed = 0
8291
for spec in selected:
83-
if run_one(spec, strict=not args.loose, clean_workdir=args.clean_workdir):
92+
if run_one(spec, strict=args.strict, clean_workdir=args.clean_workdir):
8493
ok += 1
8594
else:
8695
failed += 1

scripts/pipeline/validate.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,45 @@
11
# SPDX-FileCopyrightText: 2026 Raincloud Maintainers
22
# SPDX-License-Identifier: Apache-2.0
3-
"""Stage 6 — compare actual parquet to the `expect` block."""
3+
"""Stage 6 — compare actual parquet to the `expect` block.
4+
5+
Drift is treated as a signal, not an error: a row count or schema_hash
6+
mismatch emits a `[WARN]` line but does NOT abort the build. The intent
7+
is that users who ran `build <slug>` already opted into "download
8+
whatever's currently upstream"; brittle equality checks against a
9+
manifest captured weeks ago shouldn't turn an HF Arrow-conversion bump
10+
into a failed build. Pass `strict=True` (CLI `--strict`) to opt back
11+
into hard failures — useful for CI / pre-release gates.
12+
13+
Schema-hash comparison is prefix-aware: `expect.schema_hash` may be the
14+
full 64-char SHA-256 or a short prefix (the manifest convention is 12
15+
chars, matching the `schema_hash=` line printed by this stage). Equal-
16+
length values use strict equality; a shorter expected acts as a prefix
17+
match on the computed hash.
18+
"""
419
from __future__ import annotations
520

621
import hashlib
22+
import sys
723
from pathlib import Path
824

925
import pyarrow.parquet as pq
1026

1127
from .spec import spec_field
1228

1329

14-
def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[dict]:
30+
def _schema_hash_matches(actual: str, expected: str | None) -> bool:
31+
"""Strict equality when lengths match; prefix match when expected is
32+
shorter. Manifest entries are typically the 12-char short form."""
33+
if expected is None:
34+
return True
35+
if len(expected) == len(actual):
36+
return actual == expected
37+
if len(expected) < len(actual):
38+
return actual.startswith(expected)
39+
return False
40+
41+
42+
def validate(spec: dict, written: list[Path], *, strict: bool = False) -> list[dict]:
1543
results = []
1644
for p in written:
1745
md = pq.ParquetFile(p).metadata
@@ -20,7 +48,7 @@ def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[di
2048
ok_rows = expected_rows is None or actual_rows == expected_rows
2149
schema_hash = _schema_hash(pq.ParquetFile(p).schema_arrow)
2250
expected_hash = spec_field(spec, "expect.schema_hash")
23-
ok_schema = expected_hash is None or schema_hash == expected_hash
51+
ok_schema = _schema_hash_matches(schema_hash, expected_hash)
2452
result = {
2553
"path": str(p),
2654
"rows_ok": ok_rows,
@@ -29,8 +57,18 @@ def validate(spec: dict, written: list[Path], *, strict: bool = True) -> list[di
2957
"schema_hash": schema_hash,
3058
}
3159
results.append(result)
32-
if strict and not (ok_rows and ok_schema):
33-
raise AssertionError(f"validation failed for {p}: {result}")
60+
if not (ok_rows and ok_schema):
61+
if not ok_rows:
62+
print(f"[WARN] {p.name}: rows drift "
63+
f"(expected={expected_rows:,} actual={actual_rows:,})",
64+
file=sys.stderr)
65+
if not ok_schema:
66+
exp_disp = expected_hash if expected_hash else "—"
67+
print(f"[WARN] {p.name}: schema_hash drift "
68+
f"(expected={exp_disp} actual={schema_hash[:12]})",
69+
file=sys.stderr)
70+
if strict:
71+
raise AssertionError(f"validation failed for {p}: {result}")
3472
print(f"[validate] {p.name} rows={actual_rows:,} "
3573
f"(expected={expected_rows}) schema_hash={schema_hash[:12]}")
3674
return results

sources.schema.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,12 @@ This document defines the shape of `sources.json`, the manifest that drives the
8383

8484
/* Stage 6 — validate (scripts/pipeline/validate.py) */
8585
"expect": {
86-
"rows": 99997497, // exact; pipeline errors on mismatch unless --loose
87-
"schema_hash": null, // optional; SHA-256 of canonicalised Arrow schema
86+
"rows": 99997497, // exact; mismatch emits [WARN], does not fail unless --strict
87+
"schema_hash": null, // optional; SHA-256 of canonicalised Arrow schema.
88+
// May be the full 64-char hex or a leading prefix
89+
// (manifest convention is 12 chars, matching the
90+
// schema_hash= line printed by the validate stage).
91+
// Mismatch emits [WARN] only; pass --strict to fail.
8892
"notes": null
8993
},
9094

0 commit comments

Comments
 (0)