Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docs/setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,3 @@ Other resources, such as `slurm_partition`, `runtime`, etc. can also be set here
```{note}
Snakemake allows you to dynamically assign resources. We use the `attempt` keyword to specify memory. For example. `attempt * 2000` will provide 2GB on the first attempt of the rule, if the rule fails (out of memory) then on the second attempt it will be provided 4GB. This behavior requires the `-T/--retries` Snakemake option.
```

111 changes: 108 additions & 3 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import re
import shutil
import socket
import subprocess
import tempfile
import threading
from contextlib import contextmanager
Expand Down Expand Up @@ -266,6 +267,36 @@ def get_vcf_contig_headers(path):
return contigs


def extract_r_function_source(path, function_name):
"""Extract an R function body from an Rmd file by balanced braces."""
text = Path(path).read_text()
marker = f"{function_name} <- function"
start = text.find(marker)
if start == -1:
raise AssertionError(f"Function '{function_name}' not found in {path}")

brace_start = text.find("{", start)
if brace_start == -1:
raise AssertionError(f"Could not find opening brace for '{function_name}' in {path}")

depth = 0
end = None
for idx in range(brace_start, len(text)):
char = text[idx]
if char == "{":
depth += 1
elif char == "}":
depth -= 1
if depth == 0:
end = idx + 1
break

if end is None:
raise AssertionError(f"Could not find closing brace for '{function_name}' in {path}")

return text[start:end]


def write_reference_source_config(base_config, out_dir, *, source):
"""Write a config copy with reference.source overridden."""
text = Path(base_config).read_text()
Expand All @@ -291,6 +322,16 @@ def write_gvcf_sample_sheet(out_dir, *, sample_id, gvcf_path):
return out_path


def write_fastq_sample_sheet(out_dir, *, sample_id, library_id):
"""Write a one-row FASTQ sample sheet with explicit sample and library IDs."""
out_path = Path(out_dir) / "numeric_id_fastqs.csv"
out_path.write_text(
"sample_id,input_type,input,library_id,mark_duplicates\n"
f"{sample_id},fastq,tests/data/fastq/sample1_1.fastq.gz;tests/data/fastq/sample1_2.fastq.gz,{library_id},true\n"
)
return out_path


@contextmanager
def serve_directory(directory):
"""Serve a directory over HTTP for reference URL tests."""
Expand Down Expand Up @@ -698,7 +739,8 @@ def test_reference_url_sources(request, compressed):

@pytest.mark.full_run
@pytest.mark.parametrize("intervals_enabled", [False, True])
def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled):
@pytest.mark.parametrize("sample_id", ["sample_gvcf", "00123"])
def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_enabled, sample_id):
no_conda = request.config.getoption("--no-conda")
with tempfile.TemporaryDirectory() as tmpdir:
tmp_path = Path(tmpdir)
Expand All @@ -715,7 +757,7 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_
cfg = write_intervals_config(cfg, tmpdir, enabled=intervals_enabled)
samples = write_gvcf_sample_sheet(
tmpdir,
sample_id="sample_gvcf",
sample_id=sample_id,
gvcf_path=gvcf_path,
)

Expand All @@ -727,7 +769,26 @@ def test_create_db_mapfile_preserves_external_gvcf_sample_id(request, intervals_
result.assert_success()

mapfile = (tmp_path / "results/genomics_db/mapfile.txt").read_text().strip()
assert mapfile == f"sample_gvcf\t{gvcf_path}"
assert mapfile == f"{sample_id}\t{gvcf_path}"


@pytest.mark.dry_run
def test_fastq_dry_run_accepts_numeric_like_sample_and_library_ids(request):
no_conda = request.config.getoption("--no-conda")
with tempfile.TemporaryDirectory() as tmpdir:
smk = SnakemakeRunner(Path(tmpdir), use_conda=not no_conda)
samples = write_fastq_sample_sheet(tmpdir, sample_id="00123", library_id="456")

result = smk.dry_run(
target=[
"results/filtered_fastqs/00123/456/u1_1.fastq.gz",
"results/filtered_fastqs/00123/456/u1_2.fastq.gz",
],
configfile=get_config_file(),
samples=samples,
)

result.assert_success()


@pytest.mark.full_run
Expand Down Expand Up @@ -1365,6 +1426,50 @@ def test_write_numeric_qc_inputs_rewrites_contigs():
assert fai_lines[:2] == ["1\t1\t3", "2\t1\t3"]


def test_qc_dashboard_helper_preserves_numeric_like_ids():
"""QC dashboard helper should preserve leading zeros for headered and headerless tables."""
if shutil.which("Rscript") is None:
pytest.skip("Rscript is not available")

helper_source = extract_r_function_source(
WORKFLOW_DIR / "modules" / "qc" / "scripts" / "qc_dashboard_interactive.Rmd",
"read_table_preserve_ids",
)

with tempfile.TemporaryDirectory() as tmpdir:
tmp_path = Path(tmpdir)
headerless = tmp_path / "dist.id"
headerless.write_text("0 000123\n0 000456\n")
headered = tmp_path / "depth.tsv"
headered.write_text("INDV\tMEAN_DEPTH\n000123\t5.2\n")

script = tmp_path / "validate_read_table_preserve_ids.R"
script.write_text(
"\n".join(
[
"args <- commandArgs(trailingOnly = TRUE)",
"headerless <- args[[1]]",
"headered <- args[[2]]",
helper_source,
"headerless_df <- read_table_preserve_ids(headerless, id_cols = c('V1', 'V2'))",
"if (!is.character(headerless_df$V1) || !is.character(headerless_df$V2)) stop('Headerless ID columns were not read as character')",
"if (!identical(headerless_df$V2[[1]], '000123')) stop('Headerless leading zeros were not preserved')",
"headered_df <- read_table_preserve_ids(headered, header = TRUE, sep = '\\t', id_cols = c('INDV'))",
"if (!is.character(headered_df$INDV)) stop('Headered ID column was not read as character')",
"if (!identical(headered_df$INDV[[1]], '000123')) stop('Headered leading zeros were not preserved')",
]
)
)

result = subprocess.run(
["Rscript", str(script), str(headerless), str(headered)],
capture_output=True,
text=True,
check=False,
)
assert result.returncode == 0, (result.stdout + result.stderr).strip()


@pytest.mark.full_run
def test_qc_standalone_full_run(request):
"""Full execution of QC module as standalone workflow against test fixtures."""
Expand Down
2 changes: 1 addition & 1 deletion tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,4 +375,4 @@ def test_coverage_bed(request):
fields = line.split("\t")
assert len(fields) == 3, f"BED line should have 3 fields: {line}"
assert fields[0] == "chr2l", f"Expected contig chr2l: {line}"
assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}"
assert int(fields[1]) < int(fields[2]), f"Start should be < end: {line}"
14 changes: 12 additions & 2 deletions workflow/modules/postprocess/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,23 @@ for _k, _v in _PP_DEFAULTS.items():

# ---------- Sample sheet + metadata ------------------------------------------

_samples_df = pd.read_csv(config["samples"])
_SAMPLE_SHEET_DTYPES = {
"sample_id": "string",
"library_id": "string",
}

_SAMPLE_METADATA_DTYPES = {
"sample_id": "string",
}


_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
_ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()

_metadata_df = None
_meta_path = config.get("sample_metadata", "")
if _meta_path:
_metadata_df = pd.read_csv(_meta_path)
_metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES)
Comment on lines +51 to +67
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) will raise if library_id is absent, but library_id is optional in the samples schema and this module only uses sample_id. To avoid breaking valid sample sheets, either only force sample_id to string here, or compute dtype by intersecting _SAMPLE_SHEET_DTYPES with the columns present in the CSV header before calling read_csv.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot apply changes based on this feedback we should intersect dtype map with parsed header



def _parse_bool(series, col_name):
Expand Down
14 changes: 12 additions & 2 deletions workflow/modules/qc/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,23 @@ for _k, _v in _QC_DEFAULTS.items():

# ---------- Sample sheet + metadata ------------------------------------------

_samples_df = pd.read_csv(config["samples"])
_SAMPLE_SHEET_DTYPES = {
"sample_id": "string",
"library_id": "string",
}

_SAMPLE_METADATA_DTYPES = {
"sample_id": "string",
}


_samples_df = pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES)
_ALL_SAMPLES = _samples_df["sample_id"].unique().tolist()

_metadata_df = None
_meta_path = config.get("sample_metadata", "")
if _meta_path:
_metadata_df = pd.read_csv(_meta_path)
_metadata_df = pd.read_csv(_meta_path, dtype=_SAMPLE_METADATA_DTYPES)
Comment on lines +51 to +67
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pd.read_csv(config["samples"], dtype=_SAMPLE_SHEET_DTYPES) will error when the samples CSV does not include library_id (dtype mappings require all specified columns to be present). This module only needs sample_id, and some fixtures (e.g. tests/data/qc/samples.csv) are headerless aside from sample_id, so this change is likely to break the QC module. Suggest intersecting the dtype map with the parsed header (or only specifying {"sample_id": "string"} here).

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot apply changes based on this feedback. we should intersect dtype map with parsed header

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in commit 2d39d2d. Added a _read_csv_safe helper in both workflow/modules/qc/Snakefile and workflow/modules/postprocess/Snakefile that reads the CSV header first (nrows=0, zero data rows so essentially free), then filters the dtype map to only the columns actually present before doing the full read. This avoids the ValueError when library_id is absent while still preserving leading-zero IDs at parse time.



def _has_coords():
Expand Down
Loading
Loading