diff --git a/docs/setup.md b/docs/setup.md index 539dbcf2..985c347c 100644 --- a/docs/setup.md +++ b/docs/setup.md @@ -193,4 +193,3 @@ Other resources, such as `slurm_partition`, `runtime`, etc. can also be set here ```{note} Snakemake allows you to dynamically assign resources. We use the `attempt` keyword to specify memory. For example. `attempt * 2000` will provide 2GB on the first attempt of the rule, if the rule fails (out of memory) then on the second attempt it will be provided 4GB. This behavior requires the `-T/--retries` Snakemake option. ``` - diff --git a/tests/tests.py b/tests/tests.py index f8dc3202..b35e3ba2 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -5,6 +5,7 @@ import re import shutil import socket +import subprocess import tempfile import threading from contextlib import contextmanager @@ -266,6 +267,36 @@ def get_vcf_contig_headers(path): return contigs +def extract_r_function_source(path, function_name): + """Extract an R function body from an Rmd file by balanced braces.""" + text = Path(path).read_text() + marker = f"{function_name} <- function" + start = text.find(marker) + if start == -1: + raise AssertionError(f"Function '{function_name}' not found in {path}") + + brace_start = text.find("{", start) + if brace_start == -1: + raise AssertionError(f"Could not find opening brace for '{function_name}' in {path}") + + depth = 0 + end = None + for idx in range(brace_start, len(text)): + char = text[idx] + if char == "{": + depth += 1 + elif char == "}": + depth -= 1 + if depth == 0: + end = idx + 1 + break + + if end is None: + raise AssertionError(f"Could not find closing brace for '{function_name}' in {path}") + + return text[start:end] + + def write_reference_source_config(base_config, out_dir, *, source): """Write a config copy with reference.source overridden.""" text = Path(base_config).read_text() @@ -291,6 +322,16 @@ def write_gvcf_sample_sheet(out_dir, *, sample_id, gvcf_path): return out_path +def write_fastq_sample_sheet(out_dir, *, sample_id, library_id): + """Write a one-row FASTQ sample sheet with explicit sample and library IDs.""" + out_path = Path(out_dir) / "numeric_id_fastqs.csv" + out_path.write_text( + "sample_id,input_type,input,library_id,mark_duplicates\n" + f"{sample_id},fastq,tests/data/fastq/sample1_1.fastq.gz;tests/data/fastq/sample1_2.fastq.gz,{library_id},true\n" + ) + return out_path + + @contextmanager def serve_directory(directory): """Serve a directory over HTTP for reference URL tests.""" @@ -698,7 +739,8 @@ def test_reference_url_sources(request, compressed): @pytest.mark.full_run @pytest.mark.parametrize("intervals_enabled", [False, True]) -def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled): +@pytest.mark.parametrize("sample_id", ["sample_gvcf", "00123"]) +def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled, sample_id): no_conda = request.config.getoption("--no-conda") with tempfile.TemporaryDirectory() as tmpdir: tmp_path = Path(tmpdir) @@ -715,7 +757,7 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_ cfg = write_intervals_config(cfg, tmpdir, enabled=intervals_enabled) samples = write_gvcf_sample_sheet( tmpdir, - sample_id="sample_gvcf", + sample_id=sample_id, gvcf_path=gvcf_path, ) @@ -727,7 +769,26 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_ result.assert_success() mapfile = (tmp_path / "results/genomics_db/mapfile.txt").read_text().strip() - assert mapfile == f"sample_gvcf\t{gvcf_path}" + assert mapfile == f"{sample_id}\t{gvcf_path}" + + +@pytest.mark.dry_run +def test_fastq_dry_run_accepts_numeric_like_sample_and_library_ids(request): + no_conda = request.config.getoption("--no-conda") + with tempfile.TemporaryDirectory() as tmpdir: + smk = SnakemakeRunner(Path(tmpdir), use_conda=not no_conda) + samples = write_fastq_sample_sheet(tmpdir, sample_id="00123", library_id="456") + + result = smk.dry_run( + target=[ + "results/filtered_fastqs/00123/456/u1_1.fastq.gz", + "results/filtered_fastqs/00123/456/u1_2.fastq.gz", + ], + configfile=get_config_file(), + samples=samples, + ) + + result.assert_success() @pytest.mark.full_run @@ -1365,6 +1426,50 @@ def test_write_numeric_qc_inputs_rewrites_contigs(): assert fai_lines[:2] == ["1\t1\t3", "2\t1\t3"] +def test_qc_dashboard_helper_preserves_numeric_like_ids(): + """QC dashboard helper should preserve leading zeros for headered and headerless tables.""" + if shutil.which("Rscript") is None: + pytest.skip("Rscript is not available") + + helper_source = extract_r_function_source( + WORKFLOW_DIR / "modules" / "qc" / "scripts" / "qc_dashboard_interactive.Rmd", + "read_table_preserve_ids", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + tmp_path = Path(tmpdir) + headerless = tmp_path / "dist.id" + headerless.write_text("0 000123\n0 000456\n") + headered = tmp_path / "depth.tsv" + headered.write_text("INDV\tMEAN_DEPTH\n000123\t5.2\n") + + script = tmp_path / "validate_read_table_preserve_ids.R" + script.write_text( + "\n".join( + [ + "args <- commandArgs(trailingOnly = TRUE)", + "headerless <- args[[1]]", + "headered <- args[[2]]", + helper_source, + "headerless_df <- read_table_preserve_ids(headerless, id_cols = c('V1', 'V2'))", + "if (!is.character(headerless_df$V1) || !is.character(headerless_df$V2)) stop('Headerless ID columns were not read as character')", + "if (!identical(headerless_df$V2[[1]], '000123')) stop('Headerless leading zeros were not preserved')", + "headered_df <- read_table_preserve_ids(headered, header = TRUE, sep = '\\t', id_cols = c('INDV'))", + "if (!is.character(headered_df$INDV)) stop('Headered ID column was not read as character')", + "if (!identical(headered_df$INDV[[1]], '000123')) stop('Headered leading zeros were not preserved')", + ] + ) + ) + + result = subprocess.run( + ["Rscript", str(script), str(headerless), str(headered)], + capture_output=True, + text=True, + check=False, + ) + assert result.returncode == 0, (result.stdout + result.stderr).strip() + + @pytest.mark.full_run def test_qc_standalone_full_run(request): """Full execution of QC module as standalone workflow against test fixtures.""" diff --git a/tests/unit_tests.py b/tests/unit_tests.py index aef78cc4..0a85eaeb 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -375,4 +375,4 @@ def test_coverage_bed(request): fields = line.split("\t") assert len(fields) == 3, f"BED line should have 3 fields: {line}" assert fields[0] == "chr2l", f"Expected contig chr2l: {line}" - assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}" \ No newline at end of file + assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}" diff --git a/workflow/modules/postprocess/Snakefile b/workflow/modules/postprocess/Snakefile index abbc5067..f2a746ef 100644 --- a/workflow/modules/postprocess/Snakefile +++ b/workflow/modules/postprocess/Snakefile @@ -48,13 +48,23 @@ for _k, _v in _PP_DEFAULTS.items(): # ---------- Sample sheet + metadata ------------------------------------------ -_samples_df = pd.read_csv(config["samples"]) +_SAMPLE_SHEET_DTYPES = { + "sample_id": "string", + "library_id": "string", +} + +_SAMPLE_METADATA_DTYPES = { + "sample_id": "string", +} + + +_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist() _metadata_df = None _meta_path = config.get("sample_metadata", "") if _meta_path: - _metadata_df = pd.read_csv(_meta_path) + _metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES) def _parse_bool(series, col_name): diff --git a/workflow/modules/qc/Snakefile b/workflow/modules/qc/Snakefile index e7571163..61c3eff2 100644 --- a/workflow/modules/qc/Snakefile +++ b/workflow/modules/qc/Snakefile @@ -48,13 +48,23 @@ for _k, _v in _QC_DEFAULTS.items(): # ---------- Sample sheet + metadata ------------------------------------------ -_samples_df = pd.read_csv(config["samples"]) +_SAMPLE_SHEET_DTYPES = { + "sample_id": "string", + "library_id": "string", +} + +_SAMPLE_METADATA_DTYPES = { + "sample_id": "string", +} + + +_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) _ALL_SAMPLES = _samples_df["sample_id"].unique().tolist() _metadata_df = None _meta_path = config.get("sample_metadata", "") if _meta_path: - _metadata_df = pd.read_csv(_meta_path) + _metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES) def _has_coords(): diff --git a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd index 5279f725..7c674969 100755 --- a/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd +++ b/workflow/modules/qc/scripts/qc_dashboard_interactive.Rmd @@ -31,6 +31,71 @@ plink_prefix <- file.path(qc_dir, "plink") set.colors <- c('#1f78b4','#33a02c','#6a3d9a','#a6cee3','#b2df8a','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6') +read_table_preserve_ids <- function(file = NULL, text = NULL, id_cols = character(), ...) { + if (!is.null(file) && !is.null(text)) { + stop("Provide either file or text, not both.") + } + if (is.null(file) && is.null(text)) { + stop("Provide file or text.") + } + + read_args <- list(...) + if (is.null(read_args$check.names)) { + read_args$check.names <- FALSE + } + if (is.null(read_args$stringsAsFactors)) { + read_args$stringsAsFactors <- FALSE + } + + first_line <- if (!is.null(file)) { + readLines(file, n = 1) + } else { + strsplit(text, "\n", fixed = TRUE)[[1]][1] + } + if (length(first_line) == 0 || !nzchar(first_line)) { + stop("Cannot infer columns from an empty table.") + } + + header <- isTRUE(read_args$header) + sep <- read_args$sep + if (is.null(sep)) { + sep <- "" + } + + if (header) { + preview_df <- read.table( + text = first_line, + header = TRUE, + sep = sep, + check.names = FALSE, + stringsAsFactors = FALSE + ) + col_names <- names(preview_df) + } else { + preview_df <- read.table( + text = first_line, + header = FALSE, + sep = sep, + check.names = FALSE, + stringsAsFactors = FALSE + ) + col_names <- paste0("V", seq_len(ncol(preview_df))) + } + + col_classes <- rep(NA_character_, length(col_names)) + names(col_classes) <- col_names + col_classes[names(col_classes) %in% id_cols] <- "character" + read_args$colClasses <- col_classes + + if (!is.null(file)) { + read_args$file <- file + } else { + read_args$text <- text + } + + do.call(read.table, read_args) +} + ``` @@ -43,12 +108,12 @@ Row ```{r numsnps} #fai file for getting genome length fai_path <- file.path(qc_dir, "ref.fai") -df_fai <- read.table(fai_path, header = F) +df_fai <- read.table(fai_path, header = F, check.names = FALSE, stringsAsFactors = FALSE) genome_size <- sum(df_fai$V2) #summary of the unfiltered VCF sum_path <- paste0(vcf_prefix, ".FILTER.summary") -df_sum <- read.table(sum_path, header =T) +df_sum <- read.table(sum_path, header =T, check.names = FALSE, stringsAsFactors = FALSE) #total number of SNPs found by pipeline first_line <- df_sum %>% filter(FILTER == ".") @@ -65,10 +130,10 @@ tot_snps <- snps_remain + fil_snps proj_name <- basename(qc_dir) #num SNPs in pruned data -nums.nps <- nrow(read.table(paste0(plink_prefix, ".bim"), header = T)) +nums.nps <- nrow(read.table(paste0(plink_prefix, ".bim"), header = T, check.names = FALSE, stringsAsFactors = FALSE)) #estimate watterson's theta: -num.samples <- nrow(read.table(paste0(vcf_prefix, ".idepth"), header = T)) +num.samples <- nrow(read_table_preserve_ids(paste0(vcf_prefix, ".idepth"), header = T, id_cols = c("INDV"))) #harmonic number calculation for n-1 chromosomes (2*sample size) Hn = 0 @@ -105,7 +170,7 @@ valueBox(div_text, icon = "fa-dna") ### Mean depth ```{r} -mean.depth <- round(mean(read.table(paste0(vcf_prefix, ".idepth"), header = T)$MEAN_DEPTH), 1) +mean.depth <- round(mean(read_table_preserve_ids(paste0(vcf_prefix, ".idepth"), header = T, id_cols = c("INDV"))$MEAN_DEPTH), 1) valueBox(mean.depth, icon = "fa-align-center") ``` @@ -138,14 +203,14 @@ input$GMKey <- params$GMKey pca.path <- paste0(plink_prefix, ".eigenvec") #this makes it reasonably robust to running with plink 1.9 or plink 2.0 tmp.head <- sub("#", "", readLines(pca.path)) -df.pca <- read.table(text = tmp.head, header = TRUE) -df.val <- read.table(paste0(plink_prefix, ".eigenval"), header = FALSE) +df.pca <- read_table_preserve_ids(text = tmp.head, header = TRUE, id_cols = c("FID", "IID")) +df.val <- read.table(paste0(plink_prefix, ".eigenval"), header = FALSE, check.names = FALSE, stringsAsFactors = FALSE) df.val$prop <- (df.val$V1 / (sum(df.val$V1))) * 100 df.val$PC <- paste0("PC", row.names(df.val)) #add depth depth.path <- paste0(vcf_prefix, ".idepth") -df.depth <- read.table(depth.path, header = T) +df.depth <- read_table_preserve_ids(depth.path, header = T, id_cols = c("INDV")) df.depth <- df.depth %>% mutate_if(is.numeric, round, digits = 2) df.pca <- left_join(df.pca, df.depth, by = c("IID" = "INDV")) @@ -229,7 +294,7 @@ Row # depth plot -------------------------------------------------------------- #depth is read in the PCA chunk imiss.path <- paste0(vcf_prefix, ".imiss") -df.imiss <- read.table(imiss.path, header =T) +df.imiss <- read_table_preserve_ids(imiss.path, header =T, id_cols = c("INDV", "FID")) df.depth.miss <- inner_join(df.depth, df.imiss, by = "INDV") df.depth.miss <- df.depth.miss %>% @@ -263,7 +328,7 @@ df.depth.miss <- df.depth.miss %>% # bamstat plot -------------------------------------------------------------- # v2 qc_report.tsv uses lowercase column names: sample, percent_mapped, mean_depth, etc. bamstat.path <- file.path(qc_dir, "qc_report.tsv") -df.bamstat <- read.table(bamstat.path, header = T, sep = "\t") +df.bamstat <- read_table_preserve_ids(bamstat.path, header = T, sep = "\t", id_cols = c("sample")) df.depth.miss.bamstat <- inner_join(df.depth.miss, df.bamstat, by = c("INDV" = "sample")) df.depth.miss.bamstat <- df.depth.miss.bamstat %>% @@ -291,7 +356,7 @@ cat("*Mapping rate panel skipped — no BAM summary stats provided (qc_report co ```{r, fig.width = 5, fig.height = 5} het.path <- paste0(vcf_prefix, ".het") -df.het <- read.table(het.path, header =T) +df.het <- read_table_preserve_ids(het.path, header =T, id_cols = c("INDV")) df.het <- left_join(df.het, df.pca, by = c("INDV" = "IID")) @@ -339,10 +404,10 @@ Row ```{r, fig.height=12, fig.width=12} -dist.id <- read.table(paste0(plink_prefix, ".dist.id"))["V2"] +dist.id <- read_table_preserve_ids(paste0(plink_prefix, ".dist.id"), id_cols = c("V1", "V2"))["V2"] dist.id <- left_join(dist.id, df.pca[,c("IID","cluster")], by = c("V2" = "IID")) -df.dist <- read.table(paste0(plink_prefix, ".dist")) +df.dist <- read.table(paste0(plink_prefix, ".dist"), check.names = FALSE, stringsAsFactors = FALSE) mat.dist <- as.dist(df.dist) @@ -400,7 +465,7 @@ Row ```{r, fig.height=10, fig.width=6} -df.rel <- read.table(paste0(plink_prefix, ".king")) +df.rel <- read.table(paste0(plink_prefix, ".king"), check.names = FALSE, stringsAsFactors = FALSE) colnames(df.rel) <- dist.id$V2 rownames(df.rel) <- dist.id$V2 @@ -465,7 +530,7 @@ Row coords.path <- file.path(qc_dir, "coords.txt") if((file.exists(coords.path)) & (file.size(coords.path)>0)){ - df.coords <- read.table(coords.path) + df.coords <- read_table_preserve_ids(coords.path, id_cols = c("V1")) names(df.coords) <- c("sample.ID","long","lat") if(max(df.coords$lat < 90) & min(df.coords$lat) > 0 @@ -537,7 +602,7 @@ coords.path <- file.path(qc_dir, "coords.txt") if(file.exists(coords.path)){ if(!is.null(input$GMKey) && nchar(input$GMKey) > 0){ - df.coords <- read.table(coords.path, na.strings = c("", "nan")) + df.coords <- read_table_preserve_ids(coords.path, na.strings = c("", "nan"), id_cols = c("V1")) names(df.coords) <- c("sample.ID","long","lat") df.coords <- left_join(df.pca, df.coords, by = c("IID" = "sample.ID")) @@ -594,14 +659,14 @@ k2 <- paste0(plink_prefix, ".2.Q") k3 <- paste0(plink_prefix, ".3.Q") samps <- paste0(plink_prefix, ".fam") -x <- read.table(k2, header = F) +x <- read.table(k2, header = F, check.names = FALSE, stringsAsFactors = FALSE) struct_files <- c(k2,k3) cat_admx <- do.call("rbind",lapply(struct_files, FUN=function(files){ - x <- read.table(files, header = F) + x <- read.table(files, header = F, check.names = FALSE, stringsAsFactors = FALSE) names(x) <- gsub("V", "pop", names(x)) #rename ancestral pops - x.samps <- read.table(samps) %>% select(V2) #get sample names from .fam file + x.samps <- read_table_preserve_ids(samps, id_cols = c("V1", "V2")) %>% select(V2) #get sample names from .fam file x$sampleID <- x.samps$V2 #add sample name to df x$k <- gsub(".Q","",substr(files, nchar(files)-3+1, nchar(files))) x.long <- x %>% #pivot longer diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index c37b0327..484fdff7 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -390,7 +390,17 @@ def _parse_mark_duplicates(values, default): return pd.Series(parsed, index=values.index, dtype=bool) -samples_df = pd.read_csv(config["samples"]) +SAMPLE_SHEET_DTYPES = { + "sample_id": "string", + "library_id": "string", +} + +SAMPLE_METADATA_DTYPES = { + "sample_id": "string", +} + + +samples_df = pd.read_csv(config["samples"], dtype=SAMPLE_SHEET_DTYPES) global_mark_duplicates = bool(config["reads"]["mark_duplicates"]) if "library_id" not in samples_df.columns: @@ -685,7 +695,10 @@ def _parse_bool_column(series, column_name): metadata_df = None if config.get("sample_metadata"): - metadata_df = pd.read_csv(config["sample_metadata"]) + metadata_df = pd.read_csv( + config["sample_metadata"], + dtype=SAMPLE_METADATA_DTYPES, + ) validate(metadata_df, Path(workflow.basedir, "schemas/sample_metadata.schema.yaml")) # Validate sample_id values exist in the main sample sheet