From b4c780d33028794e51b1cd75c72ba16b4a66ddce Mon Sep 17 00:00:00 2001 From: Tim Sackton Date: Mon, 13 Apr 2026 17:09:00 -0400 Subject: [PATCH] Filter sparse QC samples before PLINK GRM --- docs/config-fields-supported.md | 3 ++- docs/modules.md | 1 + tests/tests.py | 27 ++++++++++++++++++++++++++ workflow/Snakefile | 1 + workflow/modules/qc/Snakefile | 7 +++++++ workflow/modules/qc/config/config.yaml | 1 + workflow/rules/common.smk | 1 + workflow/schemas/config.schema.yaml | 5 +++++ 8 files changed, 45 insertions(+), 1 deletion(-) diff --git a/docs/config-fields-supported.md b/docs/config-fields-supported.md index f4e68f68..bcfe739b 100644 --- a/docs/config-fields-supported.md +++ b/docs/config-fields-supported.md @@ -51,6 +51,7 @@ These are the supported v2 config keys used by the main workflow. Unknown keys a | `modules.qc.enabled` | boolean | no | `false` | Enable QC module | | `modules.qc.clusters` | integer | no | `3` | `>= 1` | | `modules.qc.min_depth` | number | no | `2` | `>= 0` | +| `modules.qc.max_sample_missingness` | number | no | `0.49` | `0..1` | | `modules.qc.google_api_key` | string | no | `""` | Optional for map panel | | `modules.qc.exclude_scaffolds` | string | no | `""` | Comma-separated scaffold list | | `modules.postprocess.enabled` | boolean | no | `false` | Enable postprocess module | @@ -78,7 +79,7 @@ When running module Snakefiles directly (outside the main workflow module-import ### QC module (`workflow/modules/qc/Snakefile`) -`samples`, `sample_metadata`, `vcf`, `fai`, `qc_report`, `clusters`, `min_depth`, `google_api_key`, `exclude_scaffolds` +`samples`, `sample_metadata`, `vcf`, `fai`, `qc_report`, `clusters`, `min_depth`, `max_sample_missingness`, `google_api_key`, `exclude_scaffolds` ### Postprocess module (`workflow/modules/postprocess/Snakefile`) diff --git a/docs/modules.md b/docs/modules.md index b27e1ba8..1677c3a8 100644 --- a/docs/modules.md +++ b/docs/modules.md @@ -17,6 +17,7 @@ The quality control module aggregates various statistics from the workflow and p |`modules.qc.clusters`| Number of clusters for PCA visualization.| `int`| |`modules.qc.google_api_key`| Google Maps API key for the terrain panel (optional).| `str`| |`modules.qc.min_depth`| Samples with average depth below this will be excluded for QC analysis.| `int`| +|`modules.qc.max_sample_missingness`| Samples with >49% missing genotypes in the pruned QC SNP set are excluded before PLINK PCA/GRM.| `float`| |`modules.qc.exclude_scaffolds`| Comma-separated scaffolds to exclude from QC SNP sampling.| `str`| ```{note} diff --git a/tests/tests.py b/tests/tests.py index b35e3ba2..f7e1f8ed 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1470,6 +1470,33 @@ def test_qc_dashboard_helper_preserves_numeric_like_ids(): assert result.returncode == 0, (result.stdout + result.stderr).strip() +@pytest.mark.dry_run +def test_qc_plink_filters_sparse_samples_before_pca(request): + """QC PLINK step should pass a per-sample missingness filter to avoid NaN GRMs.""" + no_conda = request.config.getoption("--no-conda") + with tempfile.TemporaryDirectory() as tmpdir: + smk = SnakemakeRunner( + Path(tmpdir), + use_conda=not no_conda, + snakefile=WORKFLOW_DIR / "modules" / "qc" / "Snakefile", + ) + result = smk.dry_run( + target="results/qc/plink.bed", + configfile=WORKFLOW_DIR / "modules" / "qc" / "config" / "config.yaml", + config_overrides={ + "samples": str(TEST_DATA_DIR / "qc" / "samples.csv"), + "sample_metadata": str(TEST_DATA_DIR / "qc" / "sample_metadata.csv"), + "vcf": str(TEST_DATA_DIR / "qc" / "raw.vcf.gz"), + "fai": str(TEST_DATA_DIR / "qc" / "ref.fai"), + "max_sample_missingness": "0.49", + }, + ) + result.assert_success() + + output = result.stdout + result.stderr + assert "--mind 0.49" in output + + @pytest.mark.full_run def test_qc_standalone_full_run(request): """Full execution of QC module as standalone workflow against test fixtures.""" diff --git a/workflow/Snakefile b/workflow/Snakefile index bdf2db2f..f8aabc79 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -60,6 +60,7 @@ if config["modules"]["qc"]["enabled"]: "qc_report": "results/qc_metrics/qc_report.tsv", "clusters": config["modules"]["qc"]["clusters"], "min_depth": config["modules"]["qc"]["min_depth"], + "max_sample_missingness": config["modules"]["qc"]["max_sample_missingness"], "google_api_key": config["modules"]["qc"]["google_api_key"], "exclude_scaffolds": config["modules"]["qc"]["exclude_scaffolds"], } diff --git a/workflow/modules/qc/Snakefile b/workflow/modules/qc/Snakefile index 61c3eff2..e7d02aa5 100644 --- a/workflow/modules/qc/Snakefile +++ b/workflow/modules/qc/Snakefile @@ -18,6 +18,9 @@ Config keys (flat): qc_report - path to BAM summary stats TSV (optional; empty = skip mapping rate panel) clusters - number of k-means clusters for PCA visualization (default 3) min_depth - minimum depth to include a sample (default 2) + max_sample_missingness - maximum genotype missingness allowed per sample + in the pruned QC SNP set before PLINK drops it + (default 0.49) google_api_key - Google Maps API key for terrain map (default "") exclude_scaffolds - comma-separated scaffolds to exclude (default "") """ @@ -37,6 +40,7 @@ _QC_DEFAULTS = { "qc_report": "", "clusters": 3, "min_depth": 2, + "max_sample_missingness": 0.49, "google_api_key": "", "exclude_scaffolds": "", } @@ -285,6 +289,7 @@ rule plink: king="results/qc/plink.king", params: prefix="results/qc/plink", + max_sample_missingness=config["max_sample_missingness"], conda: "envs/plink.yml" log: @@ -292,9 +297,11 @@ rule plink: shell: """ plink2 --vcf {input.vcf} --pca 10 --out {params.prefix} \ + --mind {params.max_sample_missingness} \ --allow-extra-chr --autosome-num 95 --make-bed --make-king square \ --const-fid --bad-freqs &> {log} plink --vcf {input.vcf} --out {params.prefix} \ + --mind {params.max_sample_missingness} \ --allow-extra-chr --autosome-num 95 --distance square \ --const-fid &>> {log} """ diff --git a/workflow/modules/qc/config/config.yaml b/workflow/modules/qc/config/config.yaml index 2e210f0d..45deefdd 100644 --- a/workflow/modules/qc/config/config.yaml +++ b/workflow/modules/qc/config/config.yaml @@ -11,5 +11,6 @@ qc_report: "" # optional: path to BAM summary stats TSV (empty clusters: 3 # k-means clusters for PCA visualization min_depth: 2 # minimum depth to include a sample +max_sample_missingness: 0.49 # drop sparse samples before PLINK PCA/GRM google_api_key: "" # Google Maps API key for terrain map exclude_scaffolds: "" # comma-separated scaffolds to exclude diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 484fdff7..82caa88d 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -103,6 +103,7 @@ DEFAULTS = { "enabled": False, "clusters": 3, "min_depth": 2, + "max_sample_missingness": 0.49, "google_api_key": "", "exclude_scaffolds": "", }, diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 6c4b9a4e..133a31e6 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -235,6 +235,11 @@ properties: type: number minimum: 0 default: 2 + max_sample_missingness: + type: number + minimum: 0 + maximum: 1 + default: 0.49 google_api_key: type: string default: ""