From 214a1637fa31239808888d8c0ffc3f057a78aafa Mon Sep 17 00:00:00 2001 From: Tim Sackton Date: Mon, 6 Apr 2026 20:10:07 -0400 Subject: [PATCH 1/6] Preserve numeric-like sample identifiers --- tests/tests.py | 36 +++++++- workflow/modules/postprocess/Snakefile | 14 +++- workflow/modules/qc/Snakefile | 14 +++- .../qc/scripts/qc_dashboard_interactive.Rmd | 82 ++++++++++++++----- workflow/rules/common.smk | 17 +++- 5 files changed, 135 insertions(+), 28 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index f8dc3202..050bc15e 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -291,6 +291,16 @@ def write_gvcf_sample_sheet(out_dir, *, sample_id, gvcf_path): return out_path +def write_fastq_sample_sheet(out_dir, *, sample_id, library_id): + """Write a one-row FASTQ sample sheet with explicit sample and library IDs.""" + out_path = Path(out_dir) / "numeric_id_fastqs.csv" + out_path.write_text( + "sample_id,input_type,input,library_id,mark_duplicates\n" + f"{sample_id},fastq,tests/data/fastq/sample1_1.fastq.gz;tests/data/fastq/sample1_2.fastq.gz,{library_id},true\n" + ) + return out_path + + @contextmanager def serve_directory(directory): """Serve a directory over HTTP for reference URL tests.""" @@ -698,7 +708,8 @@ def test_reference_url_sources(request, compressed): @pytest.mark.full_run @pytest.mark.parametrize("intervals_enabled", [False, True]) -def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled): +@pytest.mark.parametrize("sample_id", ["sample_gvcf", "00123"]) +def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled, sample_id): no_conda = request.config.getoption("--no-conda") with tempfile.TemporaryDirectory() as tmpdir: tmp_path = Path(tmpdir) @@ -715,7 +726,7 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_ cfg = write_intervals_config(cfg, tmpdir, enabled=intervals_enabled) samples = write_gvcf_sample_sheet( tmpdir, - sample_id="sample_gvcf", + sample_id=sample_id, gvcf_path=gvcf_path, ) @@ -727,7 +738,26 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_ result.assert_success() mapfile = (tmp_path / "results/genomics_db/mapfile.txt").read_text().strip() - assert mapfile == f"sample_gvcf\t{gvcf_path}" + assert mapfile == f"{sample_id}\t{gvcf_path}" + + +@pytest.mark.dry_run +def test_fastq_dry_run_accepts_numeric_like_sample_and_library_ids(request): + no_conda = request.config.getoption("--no-conda") + with tempfile.TemporaryDirectory() as tmpdir: + smk = SnakemakeRunner(Path(tmpdir), use_conda=not no_conda) + samples = write_fastq_sample_sheet(tmpdir, sample_id="00123", library_id="456") + + result = smk.dry_run( + target=[ + "results/filtered_fastqs/00123/456/u1_1.fastq.gz", + "results/filtered_fastqs/00123/456/u1_2.fastq.gz", + ], + configfile=get_config_file(), + samples=samples, + ) + + result.assert_success() @pytest.mark.full_run diff --git a/workflow/modules/postprocess/Snakefile b/workflow/modules/postprocess/Snakefile index abbc5067..f2a746ef 100644 --- a/workflow/modules/postprocess/Snakefile +++ b/workflow/modules/postprocess/Snakefile @@ -48,13 +48,23 @@ for _k, _v in _PP_DEFAULTS.items(): # ---------- Sample sheet + metadata ------------------------------------------ -_samples_df = pd.read_csv(config["samples"]) +_SAMPLE_SHEET_DTYPES = { + "sample_id": "string", + "library_id": "string", +} + +_SAMPLE_METADATA_DTYPES = { + "sample_id": "string", +} + + +_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist() _metadata_df = None _meta_path = config.get("sample_metadata", "") if _meta_path: - _metadata_df = pd.read_csv(_meta_path) + _metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES) def _parse_bool(series, col_name): diff --git a/workflow/modules/qc/Snakefile b/workflow/modules/qc/Snakefile index e7571163..61c3eff2 100644 --- a/workflow/modules/qc/Snakefile +++ b/workflow/modules/qc/Snakefile @@ -48,13 +48,23 @@ for _k, _v in _QC_DEFAULTS.items(): # ---------- Sample sheet + metadata ------------------------------------------ -_samples_df = pd.read_csv(config["samples"]) +_SAMPLE_SHEET_DTYPES = { + "sample_id": "string", + "library_id": "string", +} + +_SAMPLE_METADATA_DTYPES = { + "sample_id": "string", +} + + +_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist() _metadata_df = None _meta_path = config.get("sample_metadata", "") if _meta_path: - _metadata_df = pd.read_csv(_meta_path) + _metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES) def _has_coords(): diff --git a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd index 5279f725..cdc35e6f 100755 --- a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd +++ b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd @@ -31,6 +31,50 @@ plink_prefix <- file.path(qc_dir, "plink") set.colors <- c('#1f78b4','#33a02c','#6a3d9a','#a6cee3','#b2df8a','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6') +read_table_preserve_ids <- function(file = NULL, text = NULL, id_cols = character(), ...) { + if (!is.null(file) && !is.null(text)) { + stop("Provide either file or text, not both.") + } + if (is.null(file) && is.null(text)) { + stop("Provide file or text.") + } + + first_line <- if (!is.null(file)) { + readLines(file, n = 1) + } else { + strsplit(text, "\n", fixed = TRUE)[[1]][1] + } + + header_df <- read.table( + text = first_line, + header = TRUE, + check.names = FALSE, + stringsAsFactors = FALSE + ) + + col_classes <- rep(NA_character_, ncol(header_df)) + names(col_classes) <- names(header_df) + col_classes[names(col_classes) %in% id_cols] <- "character" + + if (!is.null(file)) { + read.table( + file, + colClasses = col_classes, + check.names = FALSE, + stringsAsFactors = FALSE, + ... + ) + } else { + read.table( + text = text, + colClasses = col_classes, + check.names = FALSE, + stringsAsFactors = FALSE, + ... + ) + } +} + ``` @@ -43,12 +87,12 @@ Row ```{r numsnps} #fai file for getting genome length fai_path <- file.path(qc_dir, "ref.fai") -df_fai <- read.table(fai_path, header = F) +df_fai <- read.table(fai_path, header = F, check.names = FALSE, stringsAsFactors = FALSE) genome_size <- sum(df_fai$V2) #summary of the unfiltered VCF sum_path <- paste0(vcf_prefix, ".FILTER.summary") -df_sum <- read.table(sum_path, header =T) +df_sum <- read.table(sum_path, header =T, check.names = FALSE, stringsAsFactors = FALSE) #total number of SNPs found by pipeline first_line <- df_sum %>% filter(FILTER == ".") @@ -65,10 +109,10 @@ tot_snps <- snps_remain + fil_snps proj_name <- basename(qc_dir) #num SNPs in pruned data -nums.nps <- nrow(read.table(paste0(plink_prefix, ".bim"), header = T)) +nums.nps <- nrow(read.table(paste0(plink_prefix, ".bim"), header = T, check.names = FALSE, stringsAsFactors = FALSE)) #estimate watterson's theta: -num.samples <- nrow(read.table(paste0(vcf_prefix, ".idepth"), header = T)) +num.samples <- nrow(read_table_preserve_ids(paste0(vcf_prefix, ".idepth"), header = T, id_cols = c("INDV"))) #harmonic number calculation for n-1 chromosomes (2*sample size) Hn = 0 @@ -105,7 +149,7 @@ valueBox(div_text, icon = "fa-dna") ### Mean depth ```{r} -mean.depth <- round(mean(read.table(paste0(vcf_prefix, ".idepth"), header = T)$MEAN_DEPTH), 1) +mean.depth <- round(mean(read_table_preserve_ids(paste0(vcf_prefix, ".idepth"), header = T, id_cols = c("INDV"))$MEAN_DEPTH), 1) valueBox(mean.depth, icon = "fa-align-center") ``` @@ -138,14 +182,14 @@ input$GMKey <- params$GMKey pca.path <- paste0(plink_prefix, ".eigenvec") #this makes it reasonably robust to running with plink 1.9 or plink 2.0 tmp.head <- sub("#", "", readLines(pca.path)) -df.pca <- read.table(text = tmp.head, header = TRUE) -df.val <- read.table(paste0(plink_prefix, ".eigenval"), header = FALSE) +df.pca <- read_table_preserve_ids(text = tmp.head, header = TRUE, id_cols = c("FID", "IID")) +df.val <- read.table(paste0(plink_prefix, ".eigenval"), header = FALSE, check.names = FALSE, stringsAsFactors = FALSE) df.val$prop <- (df.val$V1 / (sum(df.val$V1))) * 100 df.val$PC <- paste0("PC", row.names(df.val)) #add depth depth.path <- paste0(vcf_prefix, ".idepth") -df.depth <- read.table(depth.path, header = T) +df.depth <- read_table_preserve_ids(depth.path, header = T, id_cols = c("INDV")) df.depth <- df.depth %>% mutate_if(is.numeric, round, digits = 2) df.pca <- left_join(df.pca, df.depth, by = c("IID" = "INDV")) @@ -229,7 +273,7 @@ Row # depth plot -------------------------------------------------------------- #depth is read in the PCA chunk imiss.path <- paste0(vcf_prefix, ".imiss") -df.imiss <- read.table(imiss.path, header =T) +df.imiss <- read_table_preserve_ids(imiss.path, header =T, id_cols = c("INDV", "FID")) df.depth.miss <- inner_join(df.depth, df.imiss, by = "INDV") df.depth.miss <- df.depth.miss %>% @@ -263,7 +307,7 @@ df.depth.miss <- df.depth.miss %>% # bamstat plot -------------------------------------------------------------- # v2 qc_report.tsv uses lowercase column names: sample, percent_mapped, mean_depth, etc. bamstat.path <- file.path(qc_dir, "qc_report.tsv") -df.bamstat <- read.table(bamstat.path, header = T, sep = "\t") +df.bamstat <- read_table_preserve_ids(bamstat.path, header = T, sep = "\t", id_cols = c("sample")) df.depth.miss.bamstat <- inner_join(df.depth.miss, df.bamstat, by = c("INDV" = "sample")) df.depth.miss.bamstat <- df.depth.miss.bamstat %>% @@ -291,7 +335,7 @@ cat("*Mapping rate panel skipped — no BAM summary stats provided (qc_report co ```{r, fig.width = 5, fig.height = 5} het.path <- paste0(vcf_prefix, ".het") -df.het <- read.table(het.path, header =T) +df.het <- read_table_preserve_ids(het.path, header =T, id_cols = c("INDV")) df.het <- left_join(df.het, df.pca, by = c("INDV" = "IID")) @@ -339,10 +383,10 @@ Row ```{r, fig.height=12, fig.width=12} -dist.id <- read.table(paste0(plink_prefix, ".dist.id"))["V2"] +dist.id <- read_table_preserve_ids(paste0(plink_prefix, ".dist.id"), id_cols = c("V1", "V2"))["V2"] dist.id <- left_join(dist.id, df.pca[,c("IID","cluster")], by = c("V2" = "IID")) -df.dist <- read.table(paste0(plink_prefix, ".dist")) +df.dist <- read.table(paste0(plink_prefix, ".dist"), check.names = FALSE, stringsAsFactors = FALSE) mat.dist <- as.dist(df.dist) @@ -400,7 +444,7 @@ Row ```{r, fig.height=10, fig.width=6} -df.rel <- read.table(paste0(plink_prefix, ".king")) +df.rel <- read.table(paste0(plink_prefix, ".king"), check.names = FALSE, stringsAsFactors = FALSE) colnames(df.rel) <- dist.id$V2 rownames(df.rel) <- dist.id$V2 @@ -465,7 +509,7 @@ Row coords.path <- file.path(qc_dir, "coords.txt") if((file.exists(coords.path)) & (file.size(coords.path)>0)){ - df.coords <- read.table(coords.path) + df.coords <- read_table_preserve_ids(coords.path, id_cols = c("V1")) names(df.coords) <- c("sample.ID","long","lat") if(max(df.coords$lat < 90) & min(df.coords$lat) > 0 @@ -537,7 +581,7 @@ coords.path <- file.path(qc_dir, "coords.txt") if(file.exists(coords.path)){ if(!is.null(input$GMKey) && nchar(input$GMKey) > 0){ - df.coords <- read.table(coords.path, na.strings = c("", "nan")) + df.coords <- read_table_preserve_ids(coords.path, na.strings = c("", "nan"), id_cols = c("V1")) names(df.coords) <- c("sample.ID","long","lat") df.coords <- left_join(df.pca, df.coords, by = c("IID" = "sample.ID")) @@ -594,14 +638,14 @@ k2 <- paste0(plink_prefix, ".2.Q") k3 <- paste0(plink_prefix, ".3.Q") samps <- paste0(plink_prefix, ".fam") -x <- read.table(k2, header = F) +x <- read.table(k2, header = F, check.names = FALSE, stringsAsFactors = FALSE) struct_files <- c(k2,k3) cat_admx <- do.call("rbind",lapply(struct_files, FUN=function(files){ - x <- read.table(files, header = F) + x <- read.table(files, header = F, check.names = FALSE, stringsAsFactors = FALSE) names(x) <- gsub("V", "pop", names(x)) #rename ancestral pops - x.samps <- read.table(samps) %>% select(V2) #get sample names from .fam file + x.samps <- read_table_preserve_ids(samps, id_cols = c("V1", "V2")) %>% select(V2) #get sample names from .fam file x$sampleID <- x.samps$V2 #add sample name to df x$k <- gsub(".Q","",substr(files, nchar(files)-3+1, nchar(files))) x.long <- x %>% #pivot longer diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c37b0327..484fdff7 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -390,7 +390,17 @@ def _parse_mark_duplicates(values, default): return pd.Series(parsed, index=values.index, dtype=bool) -samples_df = pd.read_csv(config["samples"]) +SAMPLE_SHEET_DTYPES = { + "sample_id": "string", + "library_id": "string", +} + +SAMPLE_METADATA_DTYPES = { + "sample_id": "string", +} + + +samples_df = pd.read_csv(config["samples"], dtype=SAMPLE_SHEET_DTYPES) global_mark_duplicates = bool(config["reads"]["mark_duplicates"]) if "library_id" not in samples_df.columns: @@ -685,7 +695,10 @@ def _parse_bool_column(series, column_name): metadata_df = None if config.get("sample_metadata"): - metadata_df = pd.read_csv(config["sample_metadata"]) + metadata_df = pd.read_csv( + config["sample_metadata"], + dtype=SAMPLE_METADATA_DTYPES, + ) validate(metadata_df, Path(workflow.basedir, "schemas/sample_metadata.schema.yaml")) # Validate sample_id values exist in the main sample sheet From fde9e9f3fb49095c2914d61271d8bf17f1fd9a5d Mon Sep 17 00:00:00 2001 From: Tim Sackton Date: Mon, 6 Apr 2026 20:10:38 -0400 Subject: [PATCH 2/6] Trim trailing whitespace in setup docs --- docs/setup.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/setup.md b/docs/setup.md index 539dbcf2..985c347c 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -193,4 +193,3 @@ Other resources, such as `slurm_partition`, `runtime`, etc. can also be set here ```{note} Snakemake allows you to dynamically assign resources. We use the `attempt` keyword to specify memory. For example. `attempt * 2000` will provide 2GB on the first attempt of the rule, if the rule fails (out of memory) then on the second attempt it will be provided 4GB. This behavior requires the `-T/--retries` Snakemake option. ``` - From ffe9d1fd0f1898d8a1efb392a5a82842aa243544 Mon Sep 17 00:00:00 2001 From: Tim Sackton Date: Mon, 6 Apr 2026 20:11:03 -0400 Subject: [PATCH 3/6] Add trailing newline to unit tests --- tests/unit_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index aef78cc4..0a85eaeb 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -375,4 +375,4 @@ def test_coverage_bed(request): fields = line.split("\t") assert len(fields) == 3, f"BED line should have 3 fields: {line}" assert fields[0] == "chr2l", f"Expected contig chr2l: {line}" - assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}" \ No newline at end of file + assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}" From 441c13397ef987a35dfbcf1f81ba2bf18c15f174 Mon Sep 17 00:00:00 2001 From: Tim Sackton Date: Tue, 7 Apr 2026 10:42:05 -0400 Subject: [PATCH 4/6] Fix QC numeric ID preservation and add regression test --- tests/tests.py | 75 +++++++++++++++++++ .../qc/scripts/qc_dashboard_interactive.Rmd | 63 ++++++++++------ 2 files changed, 117 insertions(+), 21 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index 050bc15e..b35e3ba2 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -5,6 +5,7 @@ import re import shutil import socket +import subprocess import tempfile import threading from contextlib import contextmanager @@ -266,6 +267,36 @@ def get_vcf_contig_headers(path): return contigs +def extract_r_function_source(path, function_name): + """Extract an R function body from an Rmd file by balanced braces.""" + text = Path(path).read_text() + marker = f"{function_name} <- function" + start = text.find(marker) + if start == -1: + raise AssertionError(f"Function '{function_name}' not found in {path}") + + brace_start = text.find("{", start) + if brace_start == -1: + raise AssertionError(f"Could not find opening brace for '{function_name}' in {path}") + + depth = 0 + end = None + for idx in range(brace_start, len(text)): + char = text[idx] + if char == "{": + depth += 1 + elif char == "}": + depth -= 1 + if depth == 0: + end = idx + 1 + break + + if end is None: + raise AssertionError(f"Could not find closing brace for '{function_name}' in {path}") + + return text[start:end] + + def write_reference_source_config(base_config, out_dir, *, source): """Write a config copy with reference.source overridden.""" text = Path(base_config).read_text() @@ -1395,6 +1426,50 @@ def test_write_numeric_qc_inputs_rewrites_contigs(): assert fai_lines[:2] == ["1\t1\t3", "2\t1\t3"] +def test_qc_dashboard_helper_preserves_numeric_like_ids(): + """QC dashboard helper should preserve leading zeros for headered and headerless tables.""" + if shutil.which("Rscript") is None: + pytest.skip("Rscript is not available") + + helper_source = extract_r_function_source( + WORKFLOW_DIR / "modules" / "qc" / "scripts" / "qc_dashboard_interactive.Rmd", + "read_table_preserve_ids", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) + headerless = tmp_path / "dist.id" + headerless.write_text("0 000123\n0 000456\n") + headered = tmp_path / "depth.tsv" + headered.write_text("INDV\tMEAN_DEPTH\n000123\t5.2\n") + + script = tmp_path / "validate_read_table_preserve_ids.R" + script.write_text( + "\n".join( + [ + "args <- commandArgs(trailingOnly = TRUE)", + "headerless <- args[[1]]", + "headered <- args[[2]]", + helper_source, + "headerless_df <- read_table_preserve_ids(headerless, id_cols = c('V1', 'V2'))", + "if (!is.character(headerless_df$V1) || !is.character(headerless_df$V2)) stop('Headerless ID columns were not read as character')", + "if (!identical(headerless_df$V2[[1]], '000123')) stop('Headerless leading zeros were not preserved')", + "headered_df <- read_table_preserve_ids(headered, header = TRUE, sep = '\\t', id_cols = c('INDV'))", + "if (!is.character(headered_df$INDV)) stop('Headered ID column was not read as character')", + "if (!identical(headered_df$INDV[[1]], '000123')) stop('Headered leading zeros were not preserved')", + ] + ) + ) + + result = subprocess.run( + ["Rscript", str(script), str(headerless), str(headered)], + capture_output=True, + text=True, + check=False, + ) + assert result.returncode == 0, (result.stdout + result.stderr).strip() + + @pytest.mark.full_run def test_qc_standalone_full_run(request): """Full execution of QC module as standalone workflow against test fixtures.""" diff --git a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd index cdc35e6f..7c674969 100755 --- a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd +++ b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd @@ -39,40 +39,61 @@ read_table_preserve_ids <- function(file = NULL, text = NULL, id_cols = characte stop("Provide file or text.") } + read_args <- list(...) + if (is.null(read_args$check.names)) { + read_args$check.names <- FALSE + } + if (is.null(read_args$stringsAsFactors)) { + read_args$stringsAsFactors <- FALSE + } + first_line <- if (!is.null(file)) { readLines(file, n = 1) } else { strsplit(text, "\n", fixed = TRUE)[[1]][1] } + if (length(first_line) == 0 || !nzchar(first_line)) { + stop("Cannot infer columns from an empty table.") + } - header_df <- read.table( - text = first_line, - header = TRUE, - check.names = FALSE, - stringsAsFactors = FALSE - ) - - col_classes <- rep(NA_character_, ncol(header_df)) - names(col_classes) <- names(header_df) - col_classes[names(col_classes) %in% id_cols] <- "character" + header <- isTRUE(read_args$header) + sep <- read_args$sep + if (is.null(sep)) { + sep <- "" + } - if (!is.null(file)) { - read.table( - file, - colClasses = col_classes, + if (header) { + preview_df <- read.table( + text = first_line, + header = TRUE, + sep = sep, check.names = FALSE, - stringsAsFactors = FALSE, - ... + stringsAsFactors = FALSE ) + col_names <- names(preview_df) } else { - read.table( - text = text, - colClasses = col_classes, + preview_df <- read.table( + text = first_line, + header = FALSE, + sep = sep, check.names = FALSE, - stringsAsFactors = FALSE, - ... + stringsAsFactors = FALSE ) + col_names <- paste0("V", seq_len(ncol(preview_df))) } + + col_classes <- rep(NA_character_, length(col_names)) + names(col_classes) <- col_names + col_classes[names(col_classes) %in% id_cols] <- "character" + read_args$colClasses <- col_classes + + if (!is.null(file)) { + read_args$file <- file + } else { + read_args$text <- text + } + + do.call(read.table, read_args) } ``` From 2d39d2d2042ffc8d2882a8568ab25828c7daabe9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 7 Apr 2026 14:53:56 +0000 Subject: [PATCH 5/6] fix: intersect dtype map with actual CSV columns in qc and postprocess Snakefiles Agent-Logs-Url: https://github.com/harvardinformatics/snparcher/sessions/ab77797b-0889-48eb-8ac3-fd6d1ca54c86 Co-authored-by: tsackton <7772796+tsackton@users.noreply.github.com> --- workflow/modules/postprocess/Snakefile | 8 +++++++- workflow/modules/qc/Snakefile | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/workflow/modules/postprocess/Snakefile b/workflow/modules/postprocess/Snakefile index f2a746ef..c378a92e 100644 --- a/workflow/modules/postprocess/Snakefile +++ b/workflow/modules/postprocess/Snakefile @@ -58,7 +58,13 @@ _SAMPLE_METADATA_DTYPES = { } -_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) +def _read_csv_safe(path, dtype_map): + """Read a CSV, restricting dtype overrides to columns actually present.""" + present = set(pd.read_csv(path, nrows=0).columns) + return pd.read_csv(path, dtype={k: v for k, v in dtype_map.items() if k in present}) + + +_samples_df = _read_csv_safe(config["samples"], _SAMPLE_SHEET_DTYPES) _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist() _metadata_df = None diff --git a/workflow/modules/qc/Snakefile b/workflow/modules/qc/Snakefile index 61c3eff2..402aae4e 100644 --- a/workflow/modules/qc/Snakefile +++ b/workflow/modules/qc/Snakefile @@ -58,7 +58,13 @@ _SAMPLE_METADATA_DTYPES = { } -_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) +def _read_csv_safe(path, dtype_map): + """Read a CSV, restricting dtype overrides to columns actually present.""" + present = set(pd.read_csv(path, nrows=0).columns) + return pd.read_csv(path, dtype={k: v for k, v in dtype_map.items() if k in present}) + + +_samples_df = _read_csv_safe(config["samples"], _SAMPLE_SHEET_DTYPES) _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist() _metadata_df = None From c1cffd397f30c81129028692358202a8b7fcc66b Mon Sep 17 00:00:00 2001 From: Tim Sackton Date: Tue, 7 Apr 2026 11:57:39 -0400 Subject: [PATCH 6/6] Revert "fix: intersect dtype map with actual CSV columns in qc and postprocess Snakefiles" This reverts commit 2d39d2d2042ffc8d2882a8568ab25828c7daabe9. --- workflow/modules/postprocess/Snakefile | 8 +------- workflow/modules/qc/Snakefile | 8 +------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/workflow/modules/postprocess/Snakefile b/workflow/modules/postprocess/Snakefile index c378a92e..f2a746ef 100644 --- a/workflow/modules/postprocess/Snakefile +++ b/workflow/modules/postprocess/Snakefile @@ -58,13 +58,7 @@ _SAMPLE_METADATA_DTYPES = { } -def _read_csv_safe(path, dtype_map): - """Read a CSV, restricting dtype overrides to columns actually present.""" - present = set(pd.read_csv(path, nrows=0).columns) - return pd.read_csv(path, dtype={k: v for k, v in dtype_map.items() if k in present}) - - -_samples_df = _read_csv_safe(config["samples"], _SAMPLE_SHEET_DTYPES) +_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist() _metadata_df = None diff --git a/workflow/modules/qc/Snakefile b/workflow/modules/qc/Snakefile index 402aae4e..61c3eff2 100644 --- a/workflow/modules/qc/Snakefile +++ b/workflow/modules/qc/Snakefile @@ -58,13 +58,7 @@ _SAMPLE_METADATA_DTYPES = { } -def _read_csv_safe(path, dtype_map): - """Read a CSV, restricting dtype overrides to columns actually present.""" - present = set(pd.read_csv(path, nrows=0).columns) - return pd.read_csv(path, dtype={k: v for k, v in dtype_map.items() if k in present}) - - -_samples_df = _read_csv_safe(config["samples"], _SAMPLE_SHEET_DTYPES) +_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist() _metadata_df = None