harvardinformatics · tsackton · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/docs/setup.md b/docs/setup.md
@@ -193,4 +193,3 @@ Other resources, such as `slurm_partition`, `runtime`, etc. can also be set here
 ```{note}
 Snakemake allows you to dynamically assign resources. We use the `attempt` keyword to specify memory. For example. `attempt * 2000` will provide 2GB on the first attempt of the rule, if the rule fails (out of memory) then on the second attempt it will be provided 4GB. This behavior requires the `-T/--retries` Snakemake option.
 ```
-
diff --git a/tests/tests.py b/tests/tests.py
@@ -5,6 +5,7 @@
 import re
 import shutil
 import socket
+import subprocess
 import tempfile
 import threading
 from contextlib import contextmanager
@@ -266,6 +267,36 @@ def get_vcf_contig_headers(path):
     return contigs
 
 
+def extract_r_function_source(path, function_name):
+    """Extract an R function body from an Rmd file by balanced braces."""
+    text = Path(path).read_text()
+    marker = f"{function_name} <- function"
+    start = text.find(marker)
+    if start == -1:
+        raise AssertionError(f"Function '{function_name}' not found in {path}")
+
+    brace_start = text.find("{", start)
+    if brace_start == -1:
+        raise AssertionError(f"Could not find opening brace for '{function_name}' in {path}")
+
+    depth = 0
+    end = None
+    for idx in range(brace_start, len(text)):
+        char = text[idx]
+        if char == "{":
+            depth += 1
+        elif char == "}":
+            depth -= 1
+            if depth == 0:
+                end = idx + 1
+                break
+
+    if end is None:
+        raise AssertionError(f"Could not find closing brace for '{function_name}' in {path}")
+
+    return text[start:end]
+
+
 def write_reference_source_config(base_config, out_dir, *, source):
     """Write a config copy with reference.source overridden."""
     text = Path(base_config).read_text()
@@ -291,6 +322,16 @@ def write_gvcf_sample_sheet(out_dir, *, sample_id, gvcf_path):
     return out_path
 
 
+def write_fastq_sample_sheet(out_dir, *, sample_id, library_id):
+    """Write a one-row FASTQ sample sheet with explicit sample and library IDs."""
+    out_path = Path(out_dir) / "numeric_id_fastqs.csv"
+    out_path.write_text(
+        "sample_id,input_type,input,library_id,mark_duplicates\n"
+        f"{sample_id},fastq,tests/data/fastq/sample1_1.fastq.gz;tests/data/fastq/sample1_2.fastq.gz,{library_id},true\n"
+    )
+    return out_path
+
+
 @contextmanager
 def serve_directory(directory):
     """Serve a directory over HTTP for reference URL tests."""
@@ -698,7 +739,8 @@ def test_reference_url_sources(request, compressed):
 
 @pytest.mark.full_run
 @pytest.mark.parametrize("intervals_enabled", [False, True])
-def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled):
+@pytest.mark.parametrize("sample_id", ["sample_gvcf", "00123"])
+def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled, sample_id):
     no_conda = request.config.getoption("--no-conda")
     with tempfile.TemporaryDirectory() as tmpdir:
         tmp_path = Path(tmpdir)
@@ -715,7 +757,7 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_
         cfg = write_intervals_config(cfg, tmpdir, enabled=intervals_enabled)
         samples = write_gvcf_sample_sheet(
             tmpdir,
-            sample_id="sample_gvcf",
+            sample_id=sample_id,
             gvcf_path=gvcf_path,
         )
 
@@ -727,7 +769,26 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_
         result.assert_success()
 
         mapfile = (tmp_path / "results/genomics_db/mapfile.txt").read_text().strip()
-        assert mapfile == f"sample_gvcf\t{gvcf_path}"
+        assert mapfile == f"{sample_id}\t{gvcf_path}"
+
+
+@pytest.mark.dry_run
+def test_fastq_dry_run_accepts_numeric_like_sample_and_library_ids(request):
+    no_conda = request.config.getoption("--no-conda")
+    with tempfile.TemporaryDirectory() as tmpdir:
+        smk = SnakemakeRunner(Path(tmpdir), use_conda=not no_conda)
+        samples = write_fastq_sample_sheet(tmpdir, sample_id="00123", library_id="456")
+
+        result = smk.dry_run(
+            target=[
+                "results/filtered_fastqs/00123/456/u1_1.fastq.gz",
+                "results/filtered_fastqs/00123/456/u1_2.fastq.gz",
+            ],
+            configfile=get_config_file(),
+            samples=samples,
+        )
+
+        result.assert_success()
 
 
 @pytest.mark.full_run
@@ -1365,6 +1426,50 @@ def test_write_numeric_qc_inputs_rewrites_contigs():
         assert fai_lines[:2] == ["1\t1\t3", "2\t1\t3"]
 
 
+def test_qc_dashboard_helper_preserves_numeric_like_ids():
+    """QC dashboard helper should preserve leading zeros for headered and headerless tables."""
+    if shutil.which("Rscript") is None:
+        pytest.skip("Rscript is not available")
+
+    helper_source = extract_r_function_source(
+        WORKFLOW_DIR / "modules" / "qc" / "scripts" / "qc_dashboard_interactive.Rmd",
+        "read_table_preserve_ids",
+    )
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_path = Path(tmpdir)
+        headerless = tmp_path / "dist.id"
+        headerless.write_text("0 000123\n0 000456\n")
+        headered = tmp_path / "depth.tsv"
+        headered.write_text("INDV\tMEAN_DEPTH\n000123\t5.2\n")
+
+        script = tmp_path / "validate_read_table_preserve_ids.R"
+        script.write_text(
+            "\n".join(
+                [
+                    "args <- commandArgs(trailingOnly = TRUE)",
+                    "headerless <- args[[1]]",
+                    "headered <- args[[2]]",
+                    helper_source,
+                    "headerless_df <- read_table_preserve_ids(headerless, id_cols = c('V1', 'V2'))",
+                    "if (!is.character(headerless_df$V1) || !is.character(headerless_df$V2)) stop('Headerless ID columns were not read as character')",
+                    "if (!identical(headerless_df$V2[[1]], '000123')) stop('Headerless leading zeros were not preserved')",
+                    "headered_df <- read_table_preserve_ids(headered, header = TRUE, sep = '\\t', id_cols = c('INDV'))",
+                    "if (!is.character(headered_df$INDV)) stop('Headered ID column was not read as character')",
+                    "if (!identical(headered_df$INDV[[1]], '000123')) stop('Headered leading zeros were not preserved')",
+                ]
+            )
+        )
+
+        result = subprocess.run(
+            ["Rscript", str(script), str(headerless), str(headered)],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        assert result.returncode == 0, (result.stdout + result.stderr).strip()
+
+
 @pytest.mark.full_run
 def test_qc_standalone_full_run(request):
     """Full execution of QC module as standalone workflow against test fixtures."""

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -375,4 +375,4 @@ def test_coverage_bed(request):
             fields = line.split("\t")
             assert len(fields) == 3, f"BED line should have 3 fields: {line}"
             assert fields[0] == "chr2l", f"Expected contig chr2l: {line}"
-            assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}"
+            assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}"
diff --git a/workflow/modules/postprocess/Snakefile b/workflow/modules/postprocess/Snakefile
@@ -48,13 +48,23 @@ for _k, _v in _PP_DEFAULTS.items():
 
 # ---------- Sample sheet + metadata ------------------------------------------
 
-_samples_df = pd.read_csv(config["samples"])
+_SAMPLE_SHEET_DTYPES = {
+    "sample_id": "string",
+    "library_id": "string",
+}
+
+_SAMPLE_METADATA_DTYPES = {
+    "sample_id": "string",
+}
+
+
+_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
 _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()
 
 _metadata_df = None
 _meta_path = config.get("sample_metadata", "")
 if _meta_path:
-    _metadata_df = pd.read_csv(_meta_path)
+    _metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES)
 
 
 def _parse_bool(series, col_name):

diff --git a/workflow/modules/qc/Snakefile b/workflow/modules/qc/Snakefile
@@ -48,13 +48,23 @@ for _k, _v in _QC_DEFAULTS.items():
 
 # ---------- Sample sheet + metadata ------------------------------------------
 
-_samples_df = pd.read_csv(config["samples"])
+_SAMPLE_SHEET_DTYPES = {
+    "sample_id": "string",
+    "library_id": "string",
+}
+
+_SAMPLE_METADATA_DTYPES = {
+    "sample_id": "string",
+}
+
+
+_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
 _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()
 
 _metadata_df = None
 _meta_path = config.get("sample_metadata", "")
 if _meta_path:
-    _metadata_df = pd.read_csv(_meta_path)
+    _metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES)
 
 
 def _has_coords():
Original file line number	Diff line number	Diff line change
Expand Up		@@ -193,4 +193,3 @@ Other resources, such as `slurm_partition`, `runtime`, etc. can also be set here
		```{note}
		Snakemake allows you to dynamically assign resources. We use the `attempt` keyword to specify memory. For example. `attempt * 2000` will provide 2GB on the first attempt of the rule, if the rule fails (out of memory) then on the second attempt it will be provided 4GB. This behavior requires the `-T/--retries` Snakemake option.
		```