From 214a1637fa31239808888d8c0ffc3f057a78aafa Mon Sep 17 00:00:00 2001
From: Tim Sackton <timsackton@gmail.com>
Date: Mon, 6 Apr 2026 20:10:07 -0400
Subject: [PATCH 1/6] Preserve numeric-like sample identifiers

---
 tests/tests.py                                | 36 +++++++-
 workflow/modules/postprocess/Snakefile        | 14 +++-
 workflow/modules/qc/Snakefile                 | 14 +++-
 .../qc/scripts/qc_dashboard_interactive.Rmd   | 82 ++++++++++++++-----
 workflow/rules/common.smk                     | 17 +++-
 5 files changed, 135 insertions(+), 28 deletions(-)

diff --git a/tests/tests.py b/tests/tests.py
index f8dc3202..050bc15e 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -291,6 +291,16 @@ def write_gvcf_sample_sheet(out_dir, *, sample_id, gvcf_path):
     return out_path
 
 
+def write_fastq_sample_sheet(out_dir, *, sample_id, library_id):
+    """Write a one-row FASTQ sample sheet with explicit sample and library IDs."""
+    out_path = Path(out_dir) / "numeric_id_fastqs.csv"
+    out_path.write_text(
+        "sample_id,input_type,input,library_id,mark_duplicates\n"
+        f"{sample_id},fastq,tests/data/fastq/sample1_1.fastq.gz;tests/data/fastq/sample1_2.fastq.gz,{library_id},true\n"
+    )
+    return out_path
+
+
 @contextmanager
 def serve_directory(directory):
     """Serve a directory over HTTP for reference URL tests."""
@@ -698,7 +708,8 @@ def test_reference_url_sources(request, compressed):
 
 @pytest.mark.full_run
 @pytest.mark.parametrize("intervals_enabled", [False, True])
-def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled):
+@pytest.mark.parametrize("sample_id", ["sample_gvcf", "00123"])
+def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled, sample_id):
     no_conda = request.config.getoption("--no-conda")
     with tempfile.TemporaryDirectory() as tmpdir:
         tmp_path = Path(tmpdir)
@@ -715,7 +726,7 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_
         cfg = write_intervals_config(cfg, tmpdir, enabled=intervals_enabled)
         samples = write_gvcf_sample_sheet(
             tmpdir,
-            sample_id="sample_gvcf",
+            sample_id=sample_id,
             gvcf_path=gvcf_path,
         )
 
@@ -727,7 +738,26 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_
         result.assert_success()
 
         mapfile = (tmp_path / "results/genomics_db/mapfile.txt").read_text().strip()
-        assert mapfile == f"sample_gvcf\t{gvcf_path}"
+        assert mapfile == f"{sample_id}\t{gvcf_path}"
+
+
+@pytest.mark.dry_run
+def test_fastq_dry_run_accepts_numeric_like_sample_and_library_ids(request):
+    no_conda = request.config.getoption("--no-conda")
+    with tempfile.TemporaryDirectory() as tmpdir:
+        smk = SnakemakeRunner(Path(tmpdir), use_conda=not no_conda)
+        samples = write_fastq_sample_sheet(tmpdir, sample_id="00123", library_id="456")
+
+        result = smk.dry_run(
+            target=[
+                "results/filtered_fastqs/00123/456/u1_1.fastq.gz",
+                "results/filtered_fastqs/00123/456/u1_2.fastq.gz",
+            ],
+            configfile=get_config_file(),
+            samples=samples,
+        )
+
+        result.assert_success()
 
 
 @pytest.mark.full_run
diff --git a/workflow/modules/postprocess/Snakefile b/workflow/modules/postprocess/Snakefile
index abbc5067..f2a746ef 100644
--- a/workflow/modules/postprocess/Snakefile
+++ b/workflow/modules/postprocess/Snakefile
@@ -48,13 +48,23 @@ for _k, _v in _PP_DEFAULTS.items():
 
 # ---------- Sample sheet + metadata ------------------------------------------
 
-_samples_df = pd.read_csv(config["samples"])
+_SAMPLE_SHEET_DTYPES = {
+    "sample_id": "string",
+    "library_id": "string",
+}
+
+_SAMPLE_METADATA_DTYPES = {
+    "sample_id": "string",
+}
+
+
+_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
 _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()
 
 _metadata_df = None
 _meta_path = config.get("sample_metadata", "")
 if _meta_path:
-    _metadata_df = pd.read_csv(_meta_path)
+    _metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES)
 
 
 def _parse_bool(series, col_name):
diff --git a/workflow/modules/qc/Snakefile b/workflow/modules/qc/Snakefile
index e7571163..61c3eff2 100644
--- a/workflow/modules/qc/Snakefile
+++ b/workflow/modules/qc/Snakefile
@@ -48,13 +48,23 @@ for _k, _v in _QC_DEFAULTS.items():
 
 # ---------- Sample sheet + metadata ------------------------------------------
 
-_samples_df = pd.read_csv(config["samples"])
+_SAMPLE_SHEET_DTYPES = {
+    "sample_id": "string",
+    "library_id": "string",
+}
+
+_SAMPLE_METADATA_DTYPES = {
+    "sample_id": "string",
+}
+
+
+_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
 _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()
 
 _metadata_df = None
 _meta_path = config.get("sample_metadata", "")
 if _meta_path:
-    _metadata_df = pd.read_csv(_meta_path)
+    _metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES)
 
 
 def _has_coords():
diff --git a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd
index 5279f725..cdc35e6f 100755
--- a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd
+++ b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd
@@ -31,6 +31,50 @@ plink_prefix <- file.path(qc_dir, "plink")
 
 set.colors <- c('#1f78b4','#33a02c','#6a3d9a','#a6cee3','#b2df8a','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6')
 
+read_table_preserve_ids <- function(file = NULL, text = NULL, id_cols = character(), ...) {
+  if (!is.null(file) && !is.null(text)) {
+    stop("Provide either file or text, not both.")
+  }
+  if (is.null(file) && is.null(text)) {
+    stop("Provide file or text.")
+  }
+
+  first_line <- if (!is.null(file)) {
+    readLines(file, n = 1)
+  } else {
+    strsplit(text, "\n", fixed = TRUE)[[1]][1]
+  }
+
+  header_df <- read.table(
+    text = first_line,
+    header = TRUE,
+    check.names = FALSE,
+    stringsAsFactors = FALSE
+  )
+
+  col_classes <- rep(NA_character_, ncol(header_df))
+  names(col_classes) <- names(header_df)
+  col_classes[names(col_classes) %in% id_cols] <- "character"
+
+  if (!is.null(file)) {
+    read.table(
+      file,
+      colClasses = col_classes,
+      check.names = FALSE,
+      stringsAsFactors = FALSE,
+      ...
+    )
+  } else {
+    read.table(
+      text = text,
+      colClasses = col_classes,
+      check.names = FALSE,
+      stringsAsFactors = FALSE,
+      ...
+    )
+  }
+}
+
 ```
 
 
@@ -43,12 +87,12 @@ Row
 ```{r numsnps}
 #fai file for getting genome length
 fai_path <- file.path(qc_dir, "ref.fai")
-df_fai <- read.table(fai_path, header = F)
+df_fai <- read.table(fai_path, header = F, check.names = FALSE, stringsAsFactors = FALSE)
 genome_size <- sum(df_fai$V2)
 
 #summary of the unfiltered VCF
 sum_path <- paste0(vcf_prefix, ".FILTER.summary")
-df_sum <- read.table(sum_path, header =T)
+df_sum <- read.table(sum_path, header =T, check.names = FALSE, stringsAsFactors = FALSE)
 
 #total number of SNPs found by pipeline
 first_line <- df_sum %>% filter(FILTER == ".")
@@ -65,10 +109,10 @@ tot_snps <- snps_remain + fil_snps
 proj_name <- basename(qc_dir)
 
 #num SNPs in pruned data
-nums.nps <- nrow(read.table(paste0(plink_prefix, ".bim"), header = T))
+nums.nps <- nrow(read.table(paste0(plink_prefix, ".bim"), header = T, check.names = FALSE, stringsAsFactors = FALSE))
 
 #estimate watterson's theta:
-num.samples <- nrow(read.table(paste0(vcf_prefix, ".idepth"), header = T))
+num.samples <- nrow(read_table_preserve_ids(paste0(vcf_prefix, ".idepth"), header = T, id_cols = c("INDV")))
 
 #harmonic number calculation for n-1 chromosomes (2*sample size)
 Hn = 0
@@ -105,7 +149,7 @@ valueBox(div_text, icon = "fa-dna")
 ### Mean depth
 
 ```{r}
-mean.depth <- round(mean(read.table(paste0(vcf_prefix, ".idepth"), header = T)$MEAN_DEPTH), 1)
+mean.depth <- round(mean(read_table_preserve_ids(paste0(vcf_prefix, ".idepth"), header = T, id_cols = c("INDV"))$MEAN_DEPTH), 1)
 valueBox(mean.depth, icon = "fa-align-center")
 ```
 
@@ -138,14 +182,14 @@ input$GMKey <- params$GMKey
 pca.path <- paste0(plink_prefix, ".eigenvec")
 #this makes it reasonably robust to running with plink 1.9 or plink 2.0
 tmp.head <- sub("#", "", readLines(pca.path))
-df.pca <- read.table(text = tmp.head, header = TRUE)
-df.val <- read.table(paste0(plink_prefix, ".eigenval"), header = FALSE)
+df.pca <- read_table_preserve_ids(text = tmp.head, header = TRUE, id_cols = c("FID", "IID"))
+df.val <- read.table(paste0(plink_prefix, ".eigenval"), header = FALSE, check.names = FALSE, stringsAsFactors = FALSE)
 df.val$prop <- (df.val$V1 / (sum(df.val$V1))) * 100
 df.val$PC <- paste0("PC", row.names(df.val))
 
 #add depth
 depth.path <- paste0(vcf_prefix, ".idepth")
-df.depth <- read.table(depth.path, header = T)
+df.depth <- read_table_preserve_ids(depth.path, header = T, id_cols = c("INDV"))
 df.depth <- df.depth %>%  mutate_if(is.numeric, round, digits = 2)
 
 df.pca <- left_join(df.pca, df.depth, by = c("IID" = "INDV"))
@@ -229,7 +273,7 @@ Row
 # depth plot --------------------------------------------------------------
 #depth is read in the PCA chunk
 imiss.path <- paste0(vcf_prefix, ".imiss")
-df.imiss <- read.table(imiss.path, header =T)
+df.imiss <- read_table_preserve_ids(imiss.path, header =T, id_cols = c("INDV", "FID"))
 
 df.depth.miss <- inner_join(df.depth, df.imiss, by = "INDV")
 df.depth.miss <- df.depth.miss %>% 
@@ -263,7 +307,7 @@ df.depth.miss <- df.depth.miss %>%
 # bamstat plot --------------------------------------------------------------
 # v2 qc_report.tsv uses lowercase column names: sample, percent_mapped, mean_depth, etc.
 bamstat.path <- file.path(qc_dir, "qc_report.tsv")
-df.bamstat <- read.table(bamstat.path, header = T, sep = "\t")
+df.bamstat <- read_table_preserve_ids(bamstat.path, header = T, sep = "\t", id_cols = c("sample"))
 
 df.depth.miss.bamstat <- inner_join(df.depth.miss, df.bamstat, by = c("INDV" = "sample"))
 df.depth.miss.bamstat <- df.depth.miss.bamstat %>% 
@@ -291,7 +335,7 @@ cat("*Mapping rate panel skipped — no BAM summary stats provided (qc_report co
 
 ```{r, fig.width = 5, fig.height = 5}
 het.path <- paste0(vcf_prefix, ".het")
-df.het <- read.table(het.path, header =T)
+df.het <- read_table_preserve_ids(het.path, header =T, id_cols = c("INDV"))
 
 df.het <- left_join(df.het, df.pca, by = c("INDV" = "IID"))
 
@@ -339,10 +383,10 @@ Row
 
 ```{r, fig.height=12, fig.width=12}
 
-dist.id <- read.table(paste0(plink_prefix, ".dist.id"))["V2"]
+dist.id <- read_table_preserve_ids(paste0(plink_prefix, ".dist.id"), id_cols = c("V1", "V2"))["V2"]
 dist.id <- left_join(dist.id, df.pca[,c("IID","cluster")], by = c("V2" = "IID"))
 
-df.dist <- read.table(paste0(plink_prefix, ".dist"))
+df.dist <- read.table(paste0(plink_prefix, ".dist"), check.names = FALSE, stringsAsFactors = FALSE)
 
 mat.dist <- as.dist(df.dist)
 
@@ -400,7 +444,7 @@ Row
 
 ```{r,  fig.height=10, fig.width=6}
 
-df.rel <- read.table(paste0(plink_prefix, ".king"))
+df.rel <- read.table(paste0(plink_prefix, ".king"), check.names = FALSE, stringsAsFactors = FALSE)
 colnames(df.rel) <- dist.id$V2
 rownames(df.rel) <- dist.id$V2
 
@@ -465,7 +509,7 @@ Row
 coords.path <- file.path(qc_dir, "coords.txt")
 
 if((file.exists(coords.path)) & (file.size(coords.path)>0)){
-    df.coords <- read.table(coords.path)
+    df.coords <- read_table_preserve_ids(coords.path, id_cols = c("V1"))
     names(df.coords) <- c("sample.ID","long","lat")
     
     if(max(df.coords$lat < 90) & min(df.coords$lat) > 0 
@@ -537,7 +581,7 @@ coords.path <- file.path(qc_dir, "coords.txt")
 
 if(file.exists(coords.path)){
   if(!is.null(input$GMKey) && nchar(input$GMKey) > 0){
-    df.coords <- read.table(coords.path, na.strings = c("", "nan"))
+    df.coords <- read_table_preserve_ids(coords.path, na.strings = c("", "nan"), id_cols = c("V1"))
     names(df.coords) <- c("sample.ID","long","lat")
     
     df.coords <- left_join(df.pca, df.coords, by = c("IID" = "sample.ID"))
@@ -594,14 +638,14 @@ k2 <- paste0(plink_prefix, ".2.Q")
 k3 <- paste0(plink_prefix, ".3.Q")
 samps <- paste0(plink_prefix, ".fam")
 
-x <- read.table(k2, header = F)
+x <- read.table(k2, header = F, check.names = FALSE, stringsAsFactors = FALSE)
 
 struct_files <- c(k2,k3)
 cat_admx <- do.call("rbind",lapply(struct_files,
                                FUN=function(files){
-                                 x <- read.table(files, header = F)
+                                 x <- read.table(files, header = F, check.names = FALSE, stringsAsFactors = FALSE)
                                  names(x) <- gsub("V", "pop", names(x)) #rename ancestral pops
-                                 x.samps <- read.table(samps) %>% select(V2) #get sample names from .fam file
+                                 x.samps <- read_table_preserve_ids(samps, id_cols = c("V1", "V2")) %>% select(V2) #get sample names from .fam file
                                  x$sampleID <- x.samps$V2 #add sample name to df
                                  x$k <- gsub(".Q","",substr(files, nchar(files)-3+1, nchar(files)))
                                  x.long <- x %>% #pivot longer 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index c37b0327..484fdff7 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -390,7 +390,17 @@ def _parse_mark_duplicates(values, default):
     return pd.Series(parsed, index=values.index, dtype=bool)
 
 
-samples_df = pd.read_csv(config["samples"])
+SAMPLE_SHEET_DTYPES = {
+    "sample_id": "string",
+    "library_id": "string",
+}
+
+SAMPLE_METADATA_DTYPES = {
+    "sample_id": "string",
+}
+
+
+samples_df = pd.read_csv(config["samples"], dtype=SAMPLE_SHEET_DTYPES)
 global_mark_duplicates = bool(config["reads"]["mark_duplicates"])
 
 if "library_id" not in samples_df.columns:
@@ -685,7 +695,10 @@ def _parse_bool_column(series, column_name):
 metadata_df = None
 
 if config.get("sample_metadata"):
-    metadata_df = pd.read_csv(config["sample_metadata"])
+    metadata_df = pd.read_csv(
+        config["sample_metadata"],
+        dtype=SAMPLE_METADATA_DTYPES,
+    )
     validate(metadata_df, Path(workflow.basedir, "schemas/sample_metadata.schema.yaml"))
 
     # Validate sample_id values exist in the main sample sheet

From fde9e9f3fb49095c2914d61271d8bf17f1fd9a5d Mon Sep 17 00:00:00 2001
From: Tim Sackton <timsackton@gmail.com>
Date: Mon, 6 Apr 2026 20:10:38 -0400
Subject: [PATCH 2/6] Trim trailing whitespace in setup docs

---
 docs/setup.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/setup.md b/docs/setup.md
index 539dbcf2..985c347c 100644
--- a/docs/setup.md
+++ b/docs/setup.md
@@ -193,4 +193,3 @@ Other resources, such as `slurm_partition`, `runtime`, etc. can also be set here
 ```{note}
 Snakemake allows you to dynamically assign resources. We use the `attempt` keyword to specify memory. For example. `attempt * 2000` will provide 2GB on the first attempt of the rule, if the rule fails (out of memory) then on the second attempt it will be provided 4GB. This behavior requires the `-T/--retries` Snakemake option.
 ```
-

From ffe9d1fd0f1898d8a1efb392a5a82842aa243544 Mon Sep 17 00:00:00 2001
From: Tim Sackton <timsackton@gmail.com>
Date: Mon, 6 Apr 2026 20:11:03 -0400
Subject: [PATCH 3/6] Add trailing newline to unit tests

---
 tests/unit_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index aef78cc4..0a85eaeb 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -375,4 +375,4 @@ def test_coverage_bed(request):
             fields = line.split("\t")
             assert len(fields) == 3, f"BED line should have 3 fields: {line}"
             assert fields[0] == "chr2l", f"Expected contig chr2l: {line}"
-            assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}"
\ No newline at end of file
+            assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}"

From 441c13397ef987a35dfbcf1f81ba2bf18c15f174 Mon Sep 17 00:00:00 2001
From: Tim Sackton <tsackton@g.harvard.edu>
Date: Tue, 7 Apr 2026 10:42:05 -0400
Subject: [PATCH 4/6] Fix QC numeric ID preservation and add regression test

---
 tests/tests.py                                | 75 +++++++++++++++++++
 .../qc/scripts/qc_dashboard_interactive.Rmd   | 63 ++++++++++------
 2 files changed, 117 insertions(+), 21 deletions(-)

diff --git a/tests/tests.py b/tests/tests.py
index 050bc15e..b35e3ba2 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -5,6 +5,7 @@
 import re
 import shutil
 import socket
+import subprocess
 import tempfile
 import threading
 from contextlib import contextmanager
@@ -266,6 +267,36 @@ def get_vcf_contig_headers(path):
     return contigs
 
 
+def extract_r_function_source(path, function_name):
+    """Extract an R function body from an Rmd file by balanced braces."""
+    text = Path(path).read_text()
+    marker = f"{function_name} <- function"
+    start = text.find(marker)
+    if start == -1:
+        raise AssertionError(f"Function '{function_name}' not found in {path}")
+
+    brace_start = text.find("{", start)
+    if brace_start == -1:
+        raise AssertionError(f"Could not find opening brace for '{function_name}' in {path}")
+
+    depth = 0
+    end = None
+    for idx in range(brace_start, len(text)):
+        char = text[idx]
+        if char == "{":
+            depth += 1
+        elif char == "}":
+            depth -= 1
+            if depth == 0:
+                end = idx + 1
+                break
+
+    if end is None:
+        raise AssertionError(f"Could not find closing brace for '{function_name}' in {path}")
+
+    return text[start:end]
+
+
 def write_reference_source_config(base_config, out_dir, *, source):
     """Write a config copy with reference.source overridden."""
     text = Path(base_config).read_text()
@@ -1395,6 +1426,50 @@ def test_write_numeric_qc_inputs_rewrites_contigs():
         assert fai_lines[:2] == ["1\t1\t3", "2\t1\t3"]
 
 
+def test_qc_dashboard_helper_preserves_numeric_like_ids():
+    """QC dashboard helper should preserve leading zeros for headered and headerless tables."""
+    if shutil.which("Rscript") is None:
+        pytest.skip("Rscript is not available")
+
+    helper_source = extract_r_function_source(
+        WORKFLOW_DIR / "modules" / "qc" / "scripts" / "qc_dashboard_interactive.Rmd",
+        "read_table_preserve_ids",
+    )
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_path = Path(tmpdir)
+        headerless = tmp_path / "dist.id"
+        headerless.write_text("0 000123\n0 000456\n")
+        headered = tmp_path / "depth.tsv"
+        headered.write_text("INDV\tMEAN_DEPTH\n000123\t5.2\n")
+
+        script = tmp_path / "validate_read_table_preserve_ids.R"
+        script.write_text(
+            "\n".join(
+                [
+                    "args <- commandArgs(trailingOnly = TRUE)",
+                    "headerless <- args[[1]]",
+                    "headered <- args[[2]]",
+                    helper_source,
+                    "headerless_df <- read_table_preserve_ids(headerless, id_cols = c('V1', 'V2'))",
+                    "if (!is.character(headerless_df$V1) || !is.character(headerless_df$V2)) stop('Headerless ID columns were not read as character')",
+                    "if (!identical(headerless_df$V2[[1]], '000123')) stop('Headerless leading zeros were not preserved')",
+                    "headered_df <- read_table_preserve_ids(headered, header = TRUE, sep = '\\t', id_cols = c('INDV'))",
+                    "if (!is.character(headered_df$INDV)) stop('Headered ID column was not read as character')",
+                    "if (!identical(headered_df$INDV[[1]], '000123')) stop('Headered leading zeros were not preserved')",
+                ]
+            )
+        )
+
+        result = subprocess.run(
+            ["Rscript", str(script), str(headerless), str(headered)],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
+        assert result.returncode == 0, (result.stdout + result.stderr).strip()
+
+
 @pytest.mark.full_run
 def test_qc_standalone_full_run(request):
     """Full execution of QC module as standalone workflow against test fixtures."""
diff --git a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd
index cdc35e6f..7c674969 100755
--- a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd
+++ b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd
@@ -39,40 +39,61 @@ read_table_preserve_ids <- function(file = NULL, text = NULL, id_cols = characte
     stop("Provide file or text.")
   }
 
+  read_args <- list(...)
+  if (is.null(read_args$check.names)) {
+    read_args$check.names <- FALSE
+  }
+  if (is.null(read_args$stringsAsFactors)) {
+    read_args$stringsAsFactors <- FALSE
+  }
+
   first_line <- if (!is.null(file)) {
     readLines(file, n = 1)
   } else {
     strsplit(text, "\n", fixed = TRUE)[[1]][1]
   }
+  if (length(first_line) == 0 || !nzchar(first_line)) {
+    stop("Cannot infer columns from an empty table.")
+  }
 
-  header_df <- read.table(
-    text = first_line,
-    header = TRUE,
-    check.names = FALSE,
-    stringsAsFactors = FALSE
-  )
-
-  col_classes <- rep(NA_character_, ncol(header_df))
-  names(col_classes) <- names(header_df)
-  col_classes[names(col_classes) %in% id_cols] <- "character"
+  header <- isTRUE(read_args$header)
+  sep <- read_args$sep
+  if (is.null(sep)) {
+    sep <- ""
+  }
 
-  if (!is.null(file)) {
-    read.table(
-      file,
-      colClasses = col_classes,
+  if (header) {
+    preview_df <- read.table(
+      text = first_line,
+      header = TRUE,
+      sep = sep,
       check.names = FALSE,
-      stringsAsFactors = FALSE,
-      ...
+      stringsAsFactors = FALSE
     )
+    col_names <- names(preview_df)
   } else {
-    read.table(
-      text = text,
-      colClasses = col_classes,
+    preview_df <- read.table(
+      text = first_line,
+      header = FALSE,
+      sep = sep,
       check.names = FALSE,
-      stringsAsFactors = FALSE,
-      ...
+      stringsAsFactors = FALSE
     )
+    col_names <- paste0("V", seq_len(ncol(preview_df)))
   }
+
+  col_classes <- rep(NA_character_, length(col_names))
+  names(col_classes) <- col_names
+  col_classes[names(col_classes) %in% id_cols] <- "character"
+  read_args$colClasses <- col_classes
+
+  if (!is.null(file)) {
+    read_args$file <- file
+  } else {
+    read_args$text <- text
+  }
+
+  do.call(read.table, read_args)
 }
 
 ```

From 2d39d2d2042ffc8d2882a8568ab25828c7daabe9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 7 Apr 2026 14:53:56 +0000
Subject: [PATCH 5/6] fix: intersect dtype map with actual CSV columns in qc
 and postprocess Snakefiles

Agent-Logs-Url: https://github.com/harvardinformatics/snparcher/sessions/ab77797b-0889-48eb-8ac3-fd6d1ca54c86

Co-authored-by: tsackton <7772796+tsackton@users.noreply.github.com>
---
 workflow/modules/postprocess/Snakefile | 8 +++++++-
 workflow/modules/qc/Snakefile          | 8 +++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/workflow/modules/postprocess/Snakefile b/workflow/modules/postprocess/Snakefile
index f2a746ef..c378a92e 100644
--- a/workflow/modules/postprocess/Snakefile
+++ b/workflow/modules/postprocess/Snakefile
@@ -58,7 +58,13 @@ _SAMPLE_METADATA_DTYPES = {
 }
 
 
-_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
+def _read_csv_safe(path, dtype_map):
+    """Read a CSV, restricting dtype overrides to columns actually present."""
+    present = set(pd.read_csv(path, nrows=0).columns)
+    return pd.read_csv(path, dtype={k: v for k, v in dtype_map.items() if k in present})
+
+
+_samples_df = _read_csv_safe(config["samples"], _SAMPLE_SHEET_DTYPES)
 _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()
 
 _metadata_df = None
diff --git a/workflow/modules/qc/Snakefile b/workflow/modules/qc/Snakefile
index 61c3eff2..402aae4e 100644
--- a/workflow/modules/qc/Snakefile
+++ b/workflow/modules/qc/Snakefile
@@ -58,7 +58,13 @@ _SAMPLE_METADATA_DTYPES = {
 }
 
 
-_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
+def _read_csv_safe(path, dtype_map):
+    """Read a CSV, restricting dtype overrides to columns actually present."""
+    present = set(pd.read_csv(path, nrows=0).columns)
+    return pd.read_csv(path, dtype={k: v for k, v in dtype_map.items() if k in present})
+
+
+_samples_df = _read_csv_safe(config["samples"], _SAMPLE_SHEET_DTYPES)
 _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()
 
 _metadata_df = None

From c1cffd397f30c81129028692358202a8b7fcc66b Mon Sep 17 00:00:00 2001
From: Tim Sackton <tsackton@g.harvard.edu>
Date: Tue, 7 Apr 2026 11:57:39 -0400
Subject: [PATCH 6/6] Revert "fix: intersect dtype map with actual CSV columns
 in qc and postprocess Snakefiles"

This reverts commit 2d39d2d2042ffc8d2882a8568ab25828c7daabe9.
---
 workflow/modules/postprocess/Snakefile | 8 +-------
 workflow/modules/qc/Snakefile          | 8 +-------
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/workflow/modules/postprocess/Snakefile b/workflow/modules/postprocess/Snakefile
index c378a92e..f2a746ef 100644
--- a/workflow/modules/postprocess/Snakefile
+++ b/workflow/modules/postprocess/Snakefile
@@ -58,13 +58,7 @@ _SAMPLE_METADATA_DTYPES = {
 }
 
 
-def _read_csv_safe(path, dtype_map):
-    """Read a CSV, restricting dtype overrides to columns actually present."""
-    present = set(pd.read_csv(path, nrows=0).columns)
-    return pd.read_csv(path, dtype={k: v for k, v in dtype_map.items() if k in present})
-
-
-_samples_df = _read_csv_safe(config["samples"], _SAMPLE_SHEET_DTYPES)
+_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
 _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()
 
 _metadata_df = None
diff --git a/workflow/modules/qc/Snakefile b/workflow/modules/qc/Snakefile
index 402aae4e..61c3eff2 100644
--- a/workflow/modules/qc/Snakefile
+++ b/workflow/modules/qc/Snakefile
@@ -58,13 +58,7 @@ _SAMPLE_METADATA_DTYPES = {
 }
 
 
-def _read_csv_safe(path, dtype_map):
-    """Read a CSV, restricting dtype overrides to columns actually present."""
-    present = set(pd.read_csv(path, nrows=0).columns)
-    return pd.read_csv(path, dtype={k: v for k, v in dtype_map.items() if k in present})
-
-
-_samples_df = _read_csv_safe(config["samples"], _SAMPLE_SHEET_DTYPES)
+_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
 _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()
 
 _metadata_df = None