e-south · e-south · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026
diff --git a/.gitignore b/.gitignore
@@ -199,6 +199,8 @@ src/dnadesign/densegen/workspaces/**
 !src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/
 !src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/*.txt
 !src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/*.meme
+!src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/motif_artifacts/
+!src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/motif_artifacts/*.json
 
 # Legacy DenseGen runs (ignored to avoid local artifact noise)
 src/dnadesign/densegen/runs/**

diff --git a/.secrets.baseline b/.secrets.baseline
@@ -90,6 +90,10 @@
     {
       "path": "detect_secrets.filters.allowlist.is_line_allowlisted"
     },
+    {
+      "path": "detect_secrets.filters.common.is_baseline_file",
+      "filename": ".secrets.baseline"
+    },
     {
       "path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
       "min_level": 2
@@ -138,6 +142,38 @@
         "line_number": 181
       }
     ],
+    "src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/motif_artifacts/cpxR__meme_suite_meme__cpxR_MANWWHTTTAM.json": [
+      {
+        "type": "Hex High Entropy String",
+        "filename": "src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/motif_artifacts/cpxR__meme_suite_meme__cpxR_MANWWHTTTAM.json",
+        "hashed_secret": "2598c3ba7f3985f5df916954885b71931380e2ad",
+        "is_verified": false,
+        "line_number": 10
+      },
+      {
+        "type": "Hex High Entropy String",
+        "filename": "src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/motif_artifacts/cpxR__meme_suite_meme__cpxR_MANWWHTTTAM.json",
+        "hashed_secret": "23616517bff0fc8f7749dc3f40e0ec36ec8ebcd1",
+        "is_verified": false,
+        "line_number": 11
+      }
+    ],
+    "src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/motif_artifacts/lexA__meme_suite_meme__lexA_CTGTATAWAWWHACA.json": [
+      {
+        "type": "Hex High Entropy String",
+        "filename": "src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/motif_artifacts/lexA__meme_suite_meme__lexA_CTGTATAWAWWHACA.json",
+        "hashed_secret": "733c5c02dcc073c2c1040be08dbb665375e48571",
+        "is_verified": false,
+        "line_number": 10
+      },
+      {
+        "type": "Hex High Entropy String",
+        "filename": "src/dnadesign/densegen/workspaces/demo_meme_two_tf/inputs/motif_artifacts/lexA__meme_suite_meme__lexA_CTGTATAWAWWHACA.json",
+        "hashed_secret": "b76157d075f0bf4ee272f029f598f911769f42d6",
+        "is_verified": false,
+        "line_number": 11
+      }
+    ],
     "src/dnadesign/opal/campaigns/demo/inputs/r0/demo_y_sfxi_existing.csv": [
       {
         "type": "AWS Access Key",
@@ -175,5 +211,5 @@
       }
     ]
   },
-  "generated_at": "2026-01-15T18:03:36Z"
+  "generated_at": "2026-01-24T01:02:10Z"
 }
diff --git a/docs/plans/2026-01-25-stage-a-strata-design.md b/docs/plans/2026-01-25-stage-a-strata-design.md
@@ -0,0 +1,72 @@
+# Stage-A PWM Sampling: Strata-First Semantics (FIMO)
+
+## Context
+Stage-A PWM sampling currently mixes the ideas of thresholding, binning, and selection in a way that exposes too many knobs and makes the logs hard to interpret. The desired behavior is: (1) mine PWM-like sequences, (2) account for a spectrum of p-value strata for diagnostics/visualization, and (3) retain only the best strata prefix for Stage-B. Configuration should be minimal and ergonomic, with a single obvious knob to adjust strictness, while still capturing per-bin distributions for later analyses (e.g., Hamming/Levenshtein by bin).
+
+## Goals
+- Align sampling semantics with user intent: generated → eligible → retained.
+- Keep configuration minimal and hard to misconfigure.
+- Preserve per-bin counts for didactic plots and diagnostics.
+- Make shortfalls expected and interpretable without extra debug logs.
+- Ensure docs, demo config, and tests align with the new semantics.
+
+## Non-goals
+- Automatic per-regulator threshold calibration.
+- Changing the FIMO backend itself or its internal scoring.
+- Adding new diversity-selection algorithms (post-hoc analysis stays separate).
+
+## Proposed Semantics
+We define three counts per regulator:
+- **Generated**: number of candidate sequences sampled.
+- **Eligible**: candidates with a FIMO hit at or below a floor threshold.
+- **Retained**: eligible hits that fall within the best strata prefix, after deduplication and capping at `n_sites`.
+
+FIMO only reports hits under its reporting threshold, so eligibility is defined by that floor. Per-bin counts are computed for eligible hits to support plots and later analysis. Retention is a strict prefix of bins (best p-values), not an arbitrary list of indices.
+
+## Config Changes (Breaking)
+Replace `pvalue_threshold` and `mining.retain_bin_ids` with two semantic knobs:
+- `pvalue_strata`: ordered p-value edges from best to worst (i.e., ascending p-values). The **last** edge is the eligibility floor (FIMO `--thresh`).
+- `retain_depth`: number of best bins to keep for Stage-B (prefix of strata).
+
+`n_sites` remains as the **cap** on retained unique sites per regulator (not a target). The default behavior should be explicit in docs; a typical default is `pvalue_strata: [1e-8, 1e-6, 1e-4]` with `retain_depth: 2`.
+
+## Data Flow
+1. Generate candidate sequences as today.
+2. Run FIMO with `--thresh = last(pvalue_strata)`.
+3. Bin each reported hit by `pvalue_strata`.
+4. Accumulate **eligible** counts per bin (all bins up to the floor).
+5. Retain only bins `0..retain_depth-1`.
+6. Deduplicate retained sequences; if more than `n_sites` remain, keep the best `n_sites` ranked by `(pvalue asc, score desc)`.
+
+This keeps accounting broad while retention remains strict and bounded.
+
+## Reporting & UX
+Stage-A recap table should show:
+- `candidates` = generated/target
+- `eligible` = eligible/generated
+- `pool` = retained/n_sites
+- `bins` = per-bin `eligible/retained` pairs (e.g., `b0 12/12 | b1 55/20 | b2 400/0`)
+- `len` = `n/min/med/avg/max` for retained (pool) sequences
+
+Zero-retained cases become interpretable without extra logs:
+- `eligible=0` → no hits under floor.
+- `eligible>0` but retained bins empty → hits exist, but none fall within the retained (best) strata.
+
+## Migration
+- Remove `pvalue_threshold` and `mining.retain_bin_ids` from config schema.
+- Require `pvalue_strata` and `retain_depth` for FIMO inputs.
+- Update metadata fields to reflect `pvalue_strata` and `retain_depth`.
+- Update demo config and docs to use the new semantics.
+
+## Testing Plan
+- Config validation rejects legacy keys and enforces `pvalue_strata` + `retain_depth`.
+- Sampling tests verify:
+  - FIMO floor applied from last stratum edge.
+  - Eligibility counts include all bins up to floor.
+  - Retention is a prefix of bins (best strata).
+  - Dedup + cap are enforced on retained sites.
+- CLI recap tests verify new column labels and bin formatting.
+
+## Open Questions
+- Default for `retain_depth`: require an explicit value, or default to retaining the full strata?
+- Whether to surface the eligibility floor explicitly in metadata or derive from `pvalue_strata`.
diff --git a/pixi.toml b/pixi.toml
@@ -6,6 +6,8 @@ platforms = ["osx-arm64", "osx-64", "linux-64"]
 
 [tasks]
 cruncher = "uv run cruncher"
+dense = "uv run dense"
+pytest = "uv run pytest -q"
 
 [dependencies]
 meme = "*"
diff --git a/pyproject.toml b/pyproject.toml
@@ -100,6 +100,10 @@ where = ["src"]
 
 [tool.setuptools.package-data]
 "dnadesign.cruncher.ingest.certs" = ["*.pem"]
+"dnadesign.densegen" = [
+  "workspaces/demo_meme_two_tf/config.yaml",
+  "workspaces/demo_meme_two_tf/inputs/*.txt",
+]
 
 [tool.pytest.ini_options]
 addopts = "-ra -q"
@@ -109,6 +113,7 @@ norecursedirs = ["*/archived/*", ".venv", "venv", "build", "dist", "*.egg-info"]
 markers = ["slow: sampling-heavy tests (>10 s)"]
 filterwarnings = [
   "ignore:ArviZ is undergoing a major refactor.*:FutureWarning",
+  "ignore::FutureWarning:arviz.*",
   "ignore:builtin type SwigPyPacked has no __module__ attribute:DeprecationWarning",
   "ignore:builtin type SwigPyObject has no __module__ attribute:DeprecationWarning",
   "ignore:builtin type swigvarlink has no __module__ attribute:DeprecationWarning",

diff --git a/src/dnadesign/cruncher/README.md b/src/dnadesign/cruncher/README.md
@@ -20,6 +20,14 @@ A typical workflow looks like:
 3. Generate synthetic sequences (e.g., via [MCMC](https://en.wikipedia.org/wiki/Markov_chain_Monte_Carlo)) using the locked motifs.
 4. Analyze / visualize / report from run artifacts.
 
+Scoring is **FIMO-like**: cruncher builds log-odds PWMs against a 0‑order
+background, scans each candidate sequence to find the best window per TF
+(optionally bidirectional), and can scale that best hit to a p‑value using a
+DP‑derived null distribution (`score_scale: logp`). For `logp`, the tail
+probability for the best window is converted to a sequence‑level p via
+`p_seq = 1 − (1 − p_win)^n_windows`. This is an internal implementation; cruncher
+does not call the FIMO binary.
+
 ---
 
 ### Quickstart (happy path)

diff --git a/src/dnadesign/cruncher/docs/demos/demo_basics_two_tf.md b/src/dnadesign/cruncher/docs/demos/demo_basics_two_tf.md
@@ -4,6 +4,13 @@
 
 **cruncher** scores each TF by the best PWM match anywhere in the candidate sequence on either strand, then optimizes the min/soft‑min across TFs so the weakest TF improves. It explores sequence space with Gibbs + parallel tempering (MCMC) and returns a diverse elite set (unique up to reverse‑complement) plus diagnostics for stability/mixing. Motif overlap is allowed and treated as informative structure in analysis.
 
+Scoring is **FIMO-like** (internal implementation): for each PWM, cruncher builds
+log‑odds scores against a 0‑order background, scans all windows to find the best
+hit (optionally bidirectional), and optionally converts that best hit to a
+p‑value via a DP‑derived null distribution (`score_scale: logp`). For `logp`,
+the tail probability for the best window becomes a sequence‑level p via
+`p_seq = 1 − (1 − p_win)^n_windows`.
+
 **Terminology:**
 
 - **sites** = training binding sequences
@@ -476,12 +483,16 @@ Export the binding-site superset and the selected motifs for DenseGen runs:
 
 ```bash
 # Export binding sites (CSV/Parquet) for DenseGen binding_sites inputs
-cruncher catalog export-sites --set 1 --out /tmp/densegen_sites.csv -c "$CONFIG"
+cruncher catalog export-sites --set 1 --densegen-workspace demo_meme_two_tf -c "$CONFIG"
 
 # Export per-motif JSON artifacts for DenseGen PWM artifact inputs
-cruncher catalog export-densegen --set 1 --out /tmp/densegen_pwms -c "$CONFIG"
+cruncher catalog export-densegen --set 1 --densegen-workspace demo_meme_two_tf -c "$CONFIG"
 ```
 
+`--densegen-workspace` accepts a workspace name (resolved under `src/dnadesign/densegen/workspaces`)
+or an absolute path, and writes under that workspace's `inputs/`. You can still provide `--out`,
+but the path must remain inside the target `inputs/` directory.
+
 Then point DenseGen configs at the exported files (`type: binding_sites`) or artifacts
 (`type: pwm_artifact_set`).
 

diff --git a/src/dnadesign/cruncher/docs/demos/demo_campaigns_multi_tf.md b/src/dnadesign/cruncher/docs/demos/demo_campaigns_multi_tf.md
@@ -4,6 +4,11 @@
 
 This demo walks through a process of running category-based sequence optimization campaigns, with a focus on campaign selection (site counts + PWM quality), derived configs, and multi-TF runs.
 
+Scoring is **FIMO-like** (internal implementation): cruncher uses PWM log‑odds
+scanning against a 0‑order background, takes the best window per TF (optionally
+both strands), and can convert that best hit to a p‑value via a DP‑derived null
+distribution (`score_scale: logp`, with `p_seq = 1 − (1 − p_win)^n_windows`).
+
 ### Demo instance
 
 - **Workspace**: `src/dnadesign/cruncher/workspaces/demo_campaigns_multi_tf/`

diff --git a/src/dnadesign/cruncher/docs/reference/cli.md b/src/dnadesign/cruncher/docs/reference/cli.md
@@ -475,9 +475,15 @@ Examples:
 * `cruncher catalog pwms <config>`
 * `cruncher catalog pwms --set 1 <config>`
 * `cruncher catalog export-sites --set 1 --out densegen/sites.csv <config>`
+* `cruncher catalog export-sites --set 1 --densegen-workspace demo_meme_two_tf <config>`
 * `cruncher catalog export-densegen --set 1 --out densegen/pwms <config>`
+* `cruncher catalog export-densegen --set 1 --densegen-workspace demo_meme_two_tf <config>`
 * `cruncher catalog logos --set 1 <config>`
 
+`catalog export-densegen` and `catalog export-sites` accept `--densegen-workspace` (workspace
+name under `src/dnadesign/densegen/workspaces/` or an absolute path). When provided, outputs
+default to the workspace `inputs/` locations and must stay within that directory.
+
 ---
 
 #### `cruncher discover`

diff --git a/src/dnadesign/cruncher/docs/reference/config.md b/src/dnadesign/cruncher/docs/reference/config.md
@@ -385,6 +385,10 @@ Notes:
 - `objective.bidirectional=true` scores both strands (reverse complement) when scanning PWMs.
 - `objective.combine` controls how per-TF scores are combined (`min` for weakest-TF optimization, `sum` for sum-based).
 - `objective.allow_unscaled_llr=true` allows `score_scale=llr` in multi-TF runs (otherwise validation fails).
+- `objective.score_scale=logp` is FIMO‑like: it uses a DP‑derived null
+  distribution under a 0‑order background to compute a tail p‑value for the
+  best window, then converts to a sequence‑level p via
+  `p_seq = 1 − (1 − p_win)^n_windows` before reporting `−log10(p_seq)`.
 - `elites.min_hamming` is the Hamming-distance filter for elites (0 disables). If `output.trim.enabled=true` yields variable lengths, the distance is computed over the shared prefix plus the length difference.
 - `elites.k` controls how many sequences are retained before diversity filtering (0 = keep all).
 - `elites.dsDNA_canonicalize=true` treats reverse complements as identical when computing unique fractions and (optionally) stores `canonical_sequence` in elites.

diff --git a/src/dnadesign/cruncher/src/analysis/parquet.py b/src/dnadesign/cruncher/src/analysis/parquet.py
@@ -16,3 +16,7 @@ def read_parquet(path: Path):
     import pandas as pd
 
     return pd.read_parquet(path, engine="fastparquet")
+
+
+def write_parquet(df, path: Path) -> None:  # Write df to path as Parquet via pyarrow, without the index.
+    df.to_parquet(path, engine="pyarrow", index=False)  # NOTE(review): read_parquet uses fastparquet; confirm
diff --git a/src/dnadesign/cruncher/src/analysis/per_pwm.py b/src/dnadesign/cruncher/src/analysis/per_pwm.py
@@ -12,7 +12,7 @@
 
 import pandas as pd
 
-from dnadesign.cruncher.analysis.parquet import read_parquet
+from dnadesign.cruncher.analysis.parquet import read_parquet, write_parquet
 from dnadesign.cruncher.analysis.plots.scatter_utils import encode_sequence
 from dnadesign.cruncher.artifacts.layout import sequences_path
 from dnadesign.cruncher.core.scoring import Scorer
@@ -152,7 +152,7 @@ def gather_per_pwm_scores(
     out_df = out_df.sort_values(["chain", "draw"]).reset_index(drop=True)
     out_path.parent.mkdir(parents=True, exist_ok=True)
     if out_path.suffix == ".parquet":
-        out_df.to_parquet(out_path, engine="fastparquet", index=False)
+        write_parquet(out_df, out_path)
     else:
         out_df.to_csv(out_path, index=False)
     logger.info("Wrote change-threshold per-PWM scores → %s", out_path)
diff --git a/src/dnadesign/cruncher/src/analysis/plots/summary.py b/src/dnadesign/cruncher/src/analysis/plots/summary.py
@@ -18,7 +18,7 @@
 import pandas as pd
 import seaborn as sns
 
-from dnadesign.cruncher.analysis.parquet import read_parquet
+from dnadesign.cruncher.analysis.parquet import read_parquet, write_parquet
 from dnadesign.cruncher.analysis.plots._savefig import savefig
 
 logger = logging.getLogger(__name__)
@@ -49,7 +49,7 @@ def write_score_summary(score_df: pd.DataFrame, tf_names: list[str], out_path: P
     summary.reset_index(drop=True, inplace=True)
     out_path.parent.mkdir(parents=True, exist_ok=True)
     if out_path.suffix == ".parquet":
-        summary.to_parquet(out_path, engine="fastparquet", index=False)
+        write_parquet(summary, out_path)
     else:
         summary.to_csv(out_path, index=False)
 
@@ -71,7 +71,7 @@ def write_elite_topk(elites_df: pd.DataFrame, tf_names: list[str], out_path: Pat
     keep_cols = ["sequence"] + [c for c in ("rank", "norm_sum") if c in df.columns] + cols
     out_path.parent.mkdir(parents=True, exist_ok=True)
     if out_path.suffix == ".parquet":
-        df[keep_cols].to_parquet(out_path, engine="fastparquet", index=False)
+        write_parquet(df[keep_cols], out_path)
     else:
         df[keep_cols].to_csv(out_path, index=False)
 
@@ -120,7 +120,7 @@ def write_joint_metrics(elites_df: pd.DataFrame, tf_names: list[str], out_path:
         }
         df = pd.DataFrame([payload])
         if out_path.suffix == ".parquet":
-            df.to_parquet(out_path, engine="fastparquet", index=False)
+            write_parquet(df, out_path)
         else:
             df.to_csv(out_path, index=False)
         return
@@ -160,7 +160,7 @@ def write_joint_metrics(elites_df: pd.DataFrame, tf_names: list[str], out_path:
     }
     df = pd.DataFrame([payload])
     if out_path.suffix == ".parquet":
-        df.to_parquet(out_path, engine="fastparquet", index=False)
+        write_parquet(df, out_path)
     else:
         df.to_csv(out_path, index=False)
 

diff --git a/src/dnadesign/cruncher/src/app/analyze_workflow.py b/src/dnadesign/cruncher/src/app/analyze_workflow.py
@@ -34,7 +34,7 @@
 )
 from dnadesign.cruncher.analysis.objective import compute_objective_components
 from dnadesign.cruncher.analysis.overlap import compute_overlap_tables
-from dnadesign.cruncher.analysis.parquet import read_parquet
+from dnadesign.cruncher.analysis.parquet import read_parquet, write_parquet
 from dnadesign.cruncher.analysis.plot_registry import PLOT_SPECS
 from dnadesign.cruncher.analysis.report import ensure_report
 from dnadesign.cruncher.app.run_service import list_runs
@@ -977,8 +977,8 @@ def _plot_path(stem: str) -> Path:
         overlap_summary_path = tables_dir / f"overlap_summary.{table_ext}"
         elite_overlap_path = tables_dir / f"elite_overlap.{table_ext}"
         if table_ext == "parquet":
-            overlap_summary_df.to_parquet(overlap_summary_path, engine="fastparquet", index=False)
-            elite_overlap_df.to_parquet(elite_overlap_path, engine="fastparquet", index=False)
+            write_parquet(overlap_summary_df, overlap_summary_path)
+            write_parquet(elite_overlap_df, elite_overlap_path)
         else:
             overlap_summary_df.to_csv(overlap_summary_path, index=False)
             elite_overlap_df.to_csv(elite_overlap_path, index=False)
@@ -1041,13 +1041,13 @@ def _plot_path(stem: str) -> Path:
             if move_stats_summary_df is not None and not move_stats_summary_df.empty:
                 move_stats_summary_path = tables_dir / f"move_stats_summary.{table_ext}"
                 if table_ext == "parquet":
-                    move_stats_summary_df.to_parquet(move_stats_summary_path, engine="fastparquet", index=False)
+                    write_parquet(move_stats_summary_df, move_stats_summary_path)
                 else:
                     move_stats_summary_df.to_csv(move_stats_summary_path, index=False)
             if analysis_cfg.extra_tables and move_stats_df is not None:
                 move_stats_path = tables_dir / f"move_stats.{table_ext}"
                 if table_ext == "parquet":
-                    move_stats_df.to_parquet(move_stats_path, engine="fastparquet", index=False)
+                    write_parquet(move_stats_df, move_stats_path)
                 else:
                     move_stats_df.to_csv(move_stats_path, index=False)
 
@@ -1083,7 +1083,7 @@ def _plot_path(stem: str) -> Path:
                     pt_swap_pairs_df = pd.DataFrame(rows)
                     pt_swap_pairs_path = tables_dir / f"pt_swap_pairs.{table_ext}"
                     if table_ext == "parquet":
-                        pt_swap_pairs_df.to_parquet(pt_swap_pairs_path, engine="fastparquet", index=False)
+                        write_parquet(pt_swap_pairs_df, pt_swap_pairs_path)
                     else:
                         pt_swap_pairs_df.to_csv(pt_swap_pairs_path, index=False)
 
@@ -1097,7 +1097,7 @@ def _plot_path(stem: str) -> Path:
                     auto_opt_table_path = tables_dir / f"auto_opt_pilots.{table_ext}"
                     df_auto_table = pd.DataFrame(candidates)
                     if table_ext == "parquet":
-                        df_auto_table.to_parquet(auto_opt_table_path, engine="fastparquet", index=False)
+                        write_parquet(df_auto_table, auto_opt_table_path)
                     else:
                         df_auto_table.to_csv(auto_opt_table_path, index=False)
                 if analysis_cfg.extra_plots: