diff --git a/.nf-core.yml b/.nf-core.yml index 0f557db5..061ecb02 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,29 +1,29 @@ repository_type: pipeline lint: files_exist: - - CODE_OF_CONDUCT.md - - assets/nf-core-sash_logo_light.png - - docs/images/nf-core-sash_logo_light.png - - docs/images/nf-core-sash_logo_dark.png - - .github/ISSUE_TEMPLATE/config.yml - - .github/workflows/awstest.yml - - .github/workflows/awsfulltest.yml - - .github/ISSUE_TEMPLATE/bug_report.yml - - .github/workflows/branch.yml - - .github/workflows/ci.yml - - .github/workflows/linting_comment.yml - - .github/workflows/linting.yml - - conf/igenomes.config + - CODE_OF_CONDUCT.md + - assets/nf-core-sash_logo_light.png + - docs/images/nf-core-sash_logo_light.png + - docs/images/nf-core-sash_logo_dark.png + - .github/ISSUE_TEMPLATE/config.yml + - .github/workflows/awstest.yml + - .github/workflows/awsfulltest.yml + - .github/ISSUE_TEMPLATE/bug_report.yml + - .github/workflows/branch.yml + - .github/workflows/ci.yml + - .github/workflows/linting_comment.yml + - .github/workflows/linting.yml + - conf/igenomes.config nextflow_config: - - manifest.name - - manifest.homePage - - process.cpus - - process.memory - - process.time - - custom_config + - manifest.name + - manifest.homePage + - process.cpus + - process.memory + - process.time + - custom_config multiqc_config: - - report_comment + - report_comment files_unchanged: - - .github/ISSUE_TEMPLATE/bug_report.yml + - .github/ISSUE_TEMPLATE/bug_report.yml readme: - - nextflow_badge + - nextflow_badge diff --git a/CHANGELOG.md b/CHANGELOG.md index 21dc74a4..5c30a61c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,29 @@ Initial release of umccr/sash, created with the [nf-core](https://nf-co.re/) tem ### `Deprecated` +## [0.7.0] - Unreleased + +### Added + +- Reference-data preparation step that maps and auto-extracts bundled resources (PCGR/VEP tarballs) from `--ref_data_path`. 
(see PR #38) +- Sigrap HRDetect and MutationalPatterns now run as standalone stages; CHORD predictions from oncoanalyser plus HRDetect/MutPat outputs are integrated into the cancer report. (see PR #36) +- Somatic VCFs are exported as MAF files via `vcf2maf` for downstream compatibility. +- Optional `--pcgr_variant_chunk_size` parameter to tune PCGR chunking in BOLT somatic annotation. + +### Changed + +- BOLT updated to `0.3.0-dev-20` (and GPGR report image to `0.3.0-dev-14`), including PCGR HTML renamed to `.pcgr.grch38.html` and requiring an explicit VEP cache path. +- PCGR reference bundle refreshed to `20250314` and resolved from tarballs alongside the GRCh38 VEP cache. +- DRAGEN input directories are handled as file objects to better support staged/symlinked inputs. (see PR #12) + +### Dependencies + +| Tool | Old | New | +|------|-----|-----| +| bolt | 0.2.17 | 0.3.0-dev-20 | +| pcgr data | 20220203 | 20250314 | +| sigrap | — | 0.2.0-dev-7 | + ## [0.6.3] - 2025-10-07 ### Added @@ -113,9 +136,9 @@ Initial release of umccr/sash, created with the [nf-core](https://nf-co.re/) tem ### Dependencies -| Tool | Old | New | -|------|-----|-----| -| Linx | 1.25 | 2.0 | -| Purple | 4.0.1 | 4.1.0 | -| Bolt | — | umccr/bolt#6 | -| GPGR | — | umccr/gpgr#88 | +| Tool | Old | New | +| ------ | ----- | ------------- | +| Linx | 1.25 | 2.0 | +| Purple | 4.0.1 | 4.1.0 | +| Bolt | — | umccr/bolt#6 | +| GPGR | — | umccr/gpgr#88 | diff --git a/README.md b/README.md index 6c534c11..0c729e78 100644 --- a/README.md +++ b/README.md @@ -11,9 +11,9 @@ functionality. ## Table of contents -* [Summary](#summary) -* [Requirements](#requirements) -* [Usage](#usage) +- [Summary](#summary) +- [Requirements](#requirements) +- [Usage](#usage) ## Summary @@ -57,8 +57,19 @@ nextflow run scwatts/sash \ --outdir output/ ``` +For detailed instructions, see [docs/usage.md](docs/usage.md). 
+ +## Documentation + +The `sash` pipeline comes with documentation in the `docs/` directory: + +* [Usage](docs/usage.md): Detailed instructions on how to run the pipeline. +* [Output](docs/output.md): Description of the output files and reports. +* [Details](docs/details.md): In-depth explanation of the pipeline steps and tools. +* [ADR](docs/adr.md): Architectural Decision Records. + ## Citations You can cite a specific version of `sash` from the Zenodo record [10.5281/zenodo.15833492](https://doi.org/10.5281/zenodo.15833492) such as: -> Watts, S. C., Savelyev, V., Diakumis, P., Clayssen, Q., Mitchell, H., & Hofmann, O. (2025). umccr/sash: 0.6.0 (0.6.0). Zenodo. https://doi.org/10.5281/zenodo.15833493 +> Watts, S. C., Savelyev, V., Diakumis, P., Clayssen, Q., Mitchell, H., & Hofmann, O. (2025). umccr/sash: 0.6.0 (0.6.0). Zenodo. diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 8e141d97..5adeb527 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1 +1,4 @@ -id,subject_name,sample_name,sample_type,filetype,filepath +id,subject_name,sample_name,filetype,filepath +subject_a.example,subject_a,sample_germline,dragen_germline_dir,/path/to/dragen_germline/ +subject_a.example,subject_a,sample_somatic,dragen_somatic_dir,/path/to/dragen_somatic/ +subject_a.example,subject_a,sample_somatic,oncoanalyser_dir,/path/to/oncoanalyser/ diff --git a/conf/modules.config b/conf/modules.config index 0dc483aa..730e5af5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -57,7 +57,7 @@ process { if (fp.equals('versions.yml')) { return null - } else if (fp.matches('output/.*.pcgr_acmg.grch38.html')) { + } else if (fp.matches('output/.*.pcgr.grch38.html')) { return "${meta.key}/${meta.tumor_id}.pcgr.html" } else { def fp_out = fp.replaceFirst(/output\//, '') @@ -126,7 +126,30 @@ process { publishDir = [ path: { "${params.outdir}" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : "${meta.key}/${filename}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : "${meta.key}/${filename}" } + ] + } + + + withName: 'SIGRAP_HRDETECT' { + publishDir = [ + path: { "${params.outdir}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : "${meta.key}/sigrap/hrdetect/${filename}" } + ] + } + + withName: 'SIGRAP_MUTPAT' { + publishDir = [ + path: { "${params.outdir}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.equals('versions.yml')) { + return null + } else { + return "${meta.key}/sigrap/${filename}" + } + } ] } @@ -212,4 +235,12 @@ process { ] } + withName: 'VCF2MAF' { + publishDir = [ + path: { "${params.outdir}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : "${meta.key}/vcf2maf/${filename}" }, + ] + } + } diff --git a/conf/refdata.config b/conf/refdata.config index 6bef3738..f57e5cf2 100644 --- a/conf/refdata.config +++ b/conf/refdata.config @@ -4,7 +4,7 @@ params { umccr_reference_data = '2--0' hmf_reference_data = 'hmf_pipeline_resources.38_v2.2.0--3' - pcgr = '20220203' + pcgr = '20250314' snpeff = '5_1' oncokb = '4.12' @@ -20,7 +20,8 @@ params { } miscdata_paths { - pcgr_dir = "databases/pcgr/v${params.data_versions.pcgr}/" + pcgr_dir = "databases/pcgr/pcgr_ref_data.${params.data_versions.pcgr}.grch38.tgz" + vep_dir = "databases/pcgr/homo_sapiens_vep_113_GRCh38.tar.gz" snpeff_dir = "databases/snpeff/v${params.data_versions.snpeff}/" oncokb_genes = "databases/oncokb/v${params.data_versions.oncokb}/cancerGeneList.txt" diff --git a/docs/README.md b/docs/README.md index 5ec0f466..c03d035a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,8 +1,8 @@ -# umccr/sash: Documentation - -The umccr/sash documentation is split into the following pages: - +- [Details](details.md) + - In details of the pipeline steps - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a 
description of all of the different command-line flags. - [Output](output.md) - An overview of the different results produced by the pipeline and how to interpret them. +- [Architectural decision record (ADR)](adr.md) + - Describes a choice the team makes about a significant aspect of the software architecture they're planning to build. \ No newline at end of file diff --git a/docs/adr.md b/docs/adr.md new file mode 100644 index 00000000..b1a7ea88 --- /dev/null +++ b/docs/adr.md @@ -0,0 +1,44 @@ +# ADR #1: Implement VCF Chunking and Parallelization in Sash Workflow for PCGR Processing + +**Status**: In Progress +**Date**: 2024-11-07 +**Deciders**: Oliver Hofmann, Stephen Watts, Quentin Clayssen +**Technical Story**: Based on the limitations of PCGR in handling large variant datasets within the sash workflow, specifically impacting hypermutated samples. + +## Context +[PCGR](https://sigven.github.io/pcgr/) (Personal Cancer Genome Reporter) currently has a variant processing limit of 500,000 variants per run. In the sash workflow, hypermutated samples often exceed this variant limit. PCGR has its own filtering steps, but an additional filtering step was also introduced in Bolt. By using VCF chunking and parallel processing, we can ensure that these large datasets are analyzed effectively without exceeding the PCGR variant limit, leading to more complete annotation and a more scalable pipeline. + +## Decision +To address the limitations of PCGR when handling hypermutated samples, we WILL implement the following: + +1. **Split VCF Files into Chunks**: Input VCF files MUST be divided into chunks, each containing no more than 500,000 variants. This ensures that each chunk remains within PCGR’s processing capacity. + +2. **Parallelize Processing**: Each chunk MUST be processed concurrently through PCGR to optimize processing time. The annotated outputs from all chunks MUST be merged to create a unified dataset. + +3. 
**Integrate into Bolt Annotation**: The chunking and parallelization changes MUST be implemented in the Bolt annotation module to ensure seamless and scalable processing for large variant datasets. + +4. **Efficiency Consideration**: For now, there MAY be a loss of efficiency for larger variant sets due to the fixed resources allocated for annotation. Further resource adjustments SHOULD be evaluated in the future. + +## Consequences + +### Positive Consequences +- **Improved Efficiency**: This approach allows large variant datasets to be processed within PCGR's constraints, enhancing efficiency and ensuring more comprehensive analysis. +- **Scalability**: Chunking and parallel processing make the sash workflow more scalable for hypermutated samples, accommodating larger datasets. + +### Negative Consequences +- **Complexity**: Adding chunking and merging processes WILL increase complexity in data handling and ensuring integrity across all merged data. +- **Resource Demand**: Parallel processing MAY increase resource consumption, affecting system performance and requiring further resource management. + +## Remaining Challenges +While the proposed approach mitigates the current limitations of PCGR, it MAY not fully resolve the issues for hypermutated samples with exceptionally high variant counts. Additional solutions MUST be explored, such as: + +- **Additional Filtering Criteria**: Applying additional filters to reduce the variant count where applicable. +- **Alternative Reporting Methods**: Exploring more scalable reporting approaches that COULD handle higher variant loads. 
+ +## Status +**Status**: In Progress + +## Links +- [Related PR for VCF Chunking and Parallelization Implementation](https://github.com/scwatts/bolt/pull/2) +- [PCGR Documentation on Variant Limit](https://sigven.github.io/pcgr/articles/running.html#large-input-sets-vcf) +- Discussion on Hypermutated Samples Handling diff --git a/docs/details.md b/docs/details.md new file mode 100644 index 00000000..1e0180ab --- /dev/null +++ b/docs/details.md @@ -0,0 +1,588 @@ +# sash workflow details + +## Table of Contents +- [Overview](#overview) +- [HMFtools](#hmftools) +- [Other Tools](#other-tools) +- [Pipeline Inputs](#pipeline-inputs) +- [Workflows](#workflows) + - [Somatic Small Variants](#somatic-small-variants) + - [Somatic Structural Variants](#somatic-structural-variants) + - [Germline Small Variants](#germline-small-variants) +- [Common Reports](#common-reports) +- [sash Module Outputs](#sash-module-outputs) +- [Coverage](#coverage) +- [Reference Data](#reference-data) +- [FAQ](#faq) + +## Overview + +![Summary](images/sash_overview_qc.png) + +The sash Workflow is a genomic analysis framework comprising three primary pipelines: + +- Somatic Small Variants (SNV somatic): Detects single nucleotide variants (SNVs) and indels in tumor samples, emphasizing clinical relevance. +- Somatic Structural Variants (SV somatic): Identifies large-scale genomic alterations (deletions, duplications, etc.) and integrates copy number data. +- Germline Variants (SNV germline): Focuses on inherited variants linked to cancer predisposition. + +These pipelines utilize Bolt (a Python package designed for modular processing) and leverage outputs from the [DRAGEN](https://sapac.illumina.com/products/by-type/informatics-products/dragen-secondary-analysis.html) Variant Caller alongside the [Hartwig Medical Foundation (HMF) tools](https://github.com/hartwigmedical/hmftools/tree/master) integrated via [Oncoanalyser](https://github.com/nf-core/oncoanalyser). 
Each pipeline is tailored to a specific type of genomic variant, incorporating filtering, annotation and HTML reports for research and curation. + +--- + +## HMFtools + +HMFtools is an open-source suite for cancer genomics developed by the Hartwig Medical Foundation. Key components used in sash include: + +- [SAGE (Somatic Alterations in Genome)](https://github.com/hartwigmedical/hmftools/blob/master/sage/README.md): + A tiered SNV/indel caller targeting cancer hotspots from databases including [Cancer Genome Interpreter](https://www.cancergenomeinterpreter.org/home), [CIViC](http://civic.genome.wustl.edu/), and [OncoKB](https://oncokb.org/) to recover low-frequency variants missed by DRAGEN. Outputs a VCF with confidence tiers (hotspot, panel, high/low confidence). + +- [PURPLE](https://github.com/hartwigmedical/hmftools/tree/master/purple): + Estimates tumor purity (tumor cell fraction) and ploidy (average copy number), integrates copy number data, and calculates TMB (tumor mutation burden) and MSI (microsatellite instability). + +- [Cobalt](https://github.com/hartwigmedical/hmftools/blob/master/cobalt/README.md): + Calculates read-depth ratios from sequencing data, providing essential input for copy number analysis. Its outputs are used by PURPLE to generate accurate copy number profiles across the genome. + +- [Amber](https://github.com/hartwigmedical/hmftools/blob/master/amber/README.md): + Computes B-allele frequencies, which are critical for estimating tumor purity and ploidy. The Amber directory contains these measurements, supporting PURPLE's analysis. + +--- + +## Other Tools + +### [SIGRAP](https://github.com/umccr/sigrap) +A framework for running PCGR and other genomic reporting tools. + +### [Personal Cancer Genome Reporter (PCGR)](https://github.com/sigven/pcgr/tree/v1.4.1) +Tool for comprehensive clinical interpretation of somatic variants, providing tiered classifications and extensive annotation. 
+ +### [Cancer Predisposition Sequencing Report (CPSR)](https://github.com/sigven/cpsr) +Tool for predisposition variant analysis and reporting in germline samples. + +### [Genomics Platform Group Reporting (GPGR)](https://github.com/umccr/gpgr) +UMCCR-developed R package for generating cancer genomics reports. + +### [Linx](https://github.com/hartwigmedical/hmftools/tree/master/linx) +Tool for structural variant annotation and visualization to classify complex rearrangements. + +### [ESVEE](https://github.com/hartwigmedical/hmftools/tree/master/esvee) +Esvee is a structural variant caller optimised for short-read sequencing that identifies somatic and germline rearrangements. + +### [VIRUSBreakend](https://github.com/PapenfussLab/gridss/blob/master/VIRUSBreakend_Readme.md) +Tool for detecting viral integration events in human genome sequencing data. + +--- + +## Pipeline Inputs + +### DRAGEN +- `${tumor_id}.hard-filtered.vcf.gz`: Somatic variant calls from DRAGEN pipeline. +- Optional: `${tumor_id}.hrdscore.csv` homologous recombination deficiency scores (surfaced in the cancer report when present). + +### Oncoanalyser + +#### [ESVEE](https://github.com/hartwigmedical/hmftools/tree/master/esvee) +- `${tumor_id}.esvee.ref_depth.vcf.gz` and the accompanying `esvee/` directory: depth and preparation files used to seed eSVee structural variant calling. + +#### [SAGE](https://github.com/hartwigmedical/hmftools/blob/master/sage/README.md) +- `${tumor_id}.sage.somatic.vcf.gz`: Somatic SNV/indel calls from SAGE. + +#### [VIRUSBreakend](https://github.com/PapenfussLab/gridss/blob/master/VIRUSBreakend_Readme.md) +- Directory: `virusbreakend/`: Contains outputs from VIRUSBreakend, used for detecting viral integration events. + +#### [Cobalt](https://github.com/hartwigmedical/hmftools/blob/master/cobalt/README.md) +- Directory: `cobalt/`: Contains read-depth ratio data required for copy number analysis by PURPLE. 
+ +#### [Amber](https://github.com/hartwigmedical/hmftools/blob/master/amber/README.md) +- Directory: `amber/`: Contains B-allele frequency measurements used by PURPLE to estimate tumor purity and ploidy. + +#### CHORD +- File: `chord/{tumor_id}.chord.prediction.tsv` (optional): HRD predictions generated by oncoanalyser; incorporated into the cancer report when present. + +--- + +## Workflows + +### Somatic Small Variants + +#### General +In the Somatic Small Variants workflow, variant detection is performed using the DRAGEN Variant Caller and Oncoanalyser (relying on SAGE and PURPLE outputs). It's structured into four steps: Re-calling, Annotation, Filter, and Report. The final outputs include an HTML report summarizing the results. + +#### Summary +1. Re-calling SAGE variants to recover low-frequency mutations in hotspots. +2. Annotate variants with clinical and functional information using PCGR. +3. Filter variants based on quality and frequency criteria, while retaining those of potential clinical significance. +4. Generate comprehensive HTML reports (PCGR, Cancer Report, LINX, MultiQC). + +### Variant Calling Re-calling + +The variant calling re-calling step uses variants from [SAGE](https://github.com/hartwigmedical/hmftools/tree/master/sage), which is more sensitive than DRAGEN in detecting variants, particularly those with low allele frequency. SAGE focuses on cancer hotspots, prioritizing predefined genomic regions of high clinical or biological relevance with its [filtering system](https://github.com/hartwigmedical/hmftools/tree/master/sage#6-soft-filters). This enables the re-calling of biologically significant variants that may have been missed otherwise. + +#### Inputs +- From DRAGEN: Somatic small variant caller VCF + - `${tumor_id}.main.dragen.vcf.gz` +- From Oncoanalyser: SAGE VCF + - `${tumor_id}.main.sage.filtered.vcf.gz` + + Filtered on chromosomes 1-22, X, Y, and M. 
+ +#### Output +- Re-calling: VCF + - `${tumor_id}.rescued.vcf.gz` + +#### Steps +1. Select High-Confidence SAGE Calls in Hotspot Regions: + - Filter the SAGE output to retain only variants that pass quality filters and overlap with known hotspot regions. + - Compare the input VCF and the SAGE VCF to identify overlapping and unique variants. +2. Annotate existing somatic variant calls also present in the SAGE calls in the input VCF: + - For each variant in the input VCF, check if it exists in the SAGE existing calls. + - For variants integrated by SAGE: + - If `SAGE FILTER=PASS` and input VCF `FILTER=PASS`: + - Set `INFO/SAGE_HOTSPOT` to indicate the variant is called by SAGE in a hotspot. + - If `SAGE FILTER=PASS` and input VCF `FILTER` is not `PASS`: + - Set `INFO/SAGE_HOTSPOT` and `INFO/SAGE_RESCUE` to indicate the variant is re-called from SAGE. + - Update `FILTER=PASS` to include the variant in the final analysis. + - If `SAGE FILTER` is not `PASS`: + - Append `SAGE_lowconf` to the `FILTER` field to flag low-confidence variants. + - Transfer SAGE `FORMAT` fields to the input VCF with a `SAGE_` prefix. +3. Combine annotated input VCF with novel SAGE calls: + - Prepare novel SAGE calls. For each variant in the SAGE VCF missing from the input VCF: + - Rename certain `FORMAT` fields in the novel SAGE VCF to avoid namespace collisions: + - For example, `FORMAT/SB` is renamed to `FORMAT/SAGE_SB`. + - Retain necessary `INFO` and `FORMAT` annotations while removing others to streamline the data. + +### Annotation + +The Annotation process employs Reference Sources (GA4GH/GIAB problem region stratifications, GIAB high confidence regions, gnomAD, Hartwig hotspots), UMCCR panel of normals (built from approximately 200 normal samples), and the PCGR tool to enrich variants with [classification](https://sigven.github.io/pcgr/articles/variant_classification.html) and clinical information. 
+**These annotations are used to decide which variants are retained or filtered in the next step.** + +#### Inputs +- Small variant VCF + - `${tumor_id}.rescued.vcf.gz` + +#### Output +- Annotated VCF + - `${tumor_id}.annotations.vcf.gz` + +#### Steps +1. Set FILTER to "PASS" for unfiltered variants: + - Iterate over the input VCF file and set the `FILTER` field to `PASS` for any variants that currently have no filter status (`FILTER` is `.` or `None`). +2. Annotate the VCF against reference sources: + - Use vcfanno to add annotations to the VCF file: + - gnomAD (version 2.1) + - Hartwig Hotspots + - ENCODE Blacklist + - Genome in a Bottle High-Confidence Regions (v4.2.1) + - Low and High GC Regions (< 30% or > 65% GC content, compiled by GA4GH) + - Bad Promoter Regions (compiled by GA4GH) +3. Annotate with UMCCR panel of normals counts: + - Use vcfanno and bcftools to annotate the VCF with counts from the UMCCR panel of normals. +4. Standardize the VCF fields: + - Add new `INFO` fields for use with PCGR: + - `TUMOR_AF`, `NORMAL_AF`: Tumor and normal allele frequencies. + - `TUMOR_DP`, `NORMAL_DP`: Tumor and normal read depths. + - Add the `AD` FORMAT field: + - `AD`: Allelic depths for the reference and alternate alleles. +5. Prepare VCF for PCGR annotation: + - Make minimal VCF header keeping only INFO AF/DP, and contigs size. + - Move tumor and normal `FORMAT/AF` and `FORMAT/DP` annotations to the `INFO` field as required by PCGR. + - Set `FILTER` to `PASS` and remove all `FORMAT` and sample columns. +6. Run PCGR (v1.4.1) to annotate VCF against external sources: + - Classify variants by tiers based on annotations and functional impact according to AMP/ASCO/CAP guidelines. + - Add `INFO` fields into the VCF: `TIER`, `SYMBOL`, `CONSEQUENCE`, `MUTATION_HOTSPOT`, `TCGA_PANCANCER_COUNT`, `CLINVAR_CLNSIG`, `ICGC_PCAWG_HITS`, `COSMIC_CNT`. 
+ - External sources include VEP, ClinVar, COSMIC, TCGA, ICGC, Open Targets Platform, CancerMine, DoCM, CBMDB, DisGeNET, Cancer Hotspots, dbNSFP, UniProt/SwissProt, Pfam, DGIdb, and ChEMBL. +7. Transfer PCGR annotations to the full set of variants: + - Merge the PCGR annotations back into the original VCF file. + - Ensure that all variants, including those not selected for PCGR annotation, have relevant clinical annotations where available. + - Preserve the `FILTER` statuses and other annotations from the original VCF. + +### Filter + +The Filter step applies a series of stringent filters to somatic variant calls in the VCF file, ensuring the retention of high-confidence and biologically meaningful variants. + +#### Inputs +- Annotated VCF + - `${tumor_id}.annotations.vcf.gz` + +#### Output +- Filtered VCF + - `${tumor_id}*filters_set.vcf.gz` + +#### Filters + +Variants that do not meet these criteria will be filtered out unless they qualify for [Clinical Significance Exceptions](#clinical-significance-exceptions): + +| **Filter Type** | **Threshold/Criteria** | +|-------------------------------------------|------------------------------------------------| +| **Allele Frequency (AF) Filter** | Tumor AF < 10% (0.10) | +| **Allele Depth (AD) Filter** | Fewer than 4 supporting reads (6 in low-complexity regions) | +| **Non-GIAB AD Filter** | Stricter thresholds outside GIAB high-confidence regions | +| **Problematic Genomic Regions Filter** | Overlap with ENCODE blacklist, bad promoter, or low-complexity regions | +| **Population Frequency (gnomAD) Filter** | gnomAD AF ≥ 1% (0.01) | +| **Panel of Normals (PoN) Germline Filter**| Present in ≥ 5 normal samples or PoN AF > 20% (0.20) | + +#### Clinical Significance Exceptions + +| Exception Category | Criteria | +|-----------------------------------|-------------------------------------------------------------------------| +| **Reference Database Hit Count** | COSMIC count ≥10 OR TCGA pan-cancer count ≥5 OR ICGC PCAWG 
count ≥5 | +| **ClinVar Pathogenicity** | ClinVar classification of `conflicting_interpretations_of_pathogenicity`, `likely_pathogenic`, `pathogenic`, or `uncertain_significance` | +| **Mutation Hotspots** | Annotated as `HMF_HOTSPOT`, `PCGR_MUTATION_HOTSPOT`, or SAGE Hotspots (CGI, CIViC, OncoKB) | +| **PCGR Tier Exception** | Classified as `TIER_1` OR `TIER_2` | + +### Reports + +The Report step utilizes the Personal Cancer Genome Reporter (PCGR) and other tools to generate comprehensive reports. + +#### Inputs +- Purple purity data +- Filtered VCF + - `${tumor_id}*filters_set.vcf.gz` +- DRAGEN VCF + - `${tumor_id}.main.dragen.vcf.gz` + +#### Output +- PCGR Cancer report + - `${tumor_id}.pcgr.grch38.html` + +#### Steps +1. Generate BCFtools Statistics on the Input VCF: + - Run `bcftools stats` to gather statistics on variant quality and distribution. +2. Calculate Allele Frequency Distributions: + - Filter and normalize variants according to high-confidence regions. + - Extract allele frequency data from tumor samples. + - Produce both a global allele frequency summary and a subset of allele frequencies restricted to key cancer genes. +3. Compare Variant Counts From Two Variant Sets (DRAGEN vs. BOLT): + - Count the total number and types of variants (SNPs, Indels, Others) passing filters in both the DRAGEN VCF and the Filtered BOLT VCF. +4. Count Variants by Processing Stage. +5. Parse Purity and Ploidy Information (Purple Data). +6. Run PCGR (GRCh38 VEP 113 / `pcgr_ref_data.20250314`) to generate the final report. If PCGR struggles with very large VCFs, tune chunking with `--pcgr_variant_chunk_size` to cap variants per batch. + +### VCF to MAF conversion + +After filtering, the pipeline converts the somatic VCF to MAF using `vcf2maf` (v1.6.22) for downstream tools that expect MAF format. 
+ +#### Output +- MAF file for the tumour/normal pair + - `${tumor_id}.maf` + +### Somatic Structural Variants + +The Somatic Structural Variants (SVs) pipeline identifies and annotates large-scale genomic alterations, including deletions, duplications, inversions, insertions, and translocations in tumor samples. Calls now come from eSVee (replacing GRIDSS/GRIPSS), but the downstream PURPLE/SnpEff/prioritisation steps remain unchanged. + +#### Summary +1. eSVee filtering: + - Refines the structural variant calls using read counts, panel-of-normals, known fusion hotspots, and repeat masker annotations. +2. PURPLE: + - Combines the eSVee-filtered SV calls with copy number variation (CNV) data and tumor purity/ploidy estimates. +3. Annotation: + - Combines SV calls with CNV data and annotates using [SnpEff](https://github.com/pcingola/SnpEff). +4. Prioritization: + - Prioritizes SV annotations based on [AstraZeneca-NGS](https://github.com/AstraZeneca-NGS/simple_sv_annotation) using curated reference data. +5. Report: + - Generates cancer report and MultiQC output. + +#### Inputs +- eSVee (GRIDSS/GRIPSS replacement) + - `${tumor_id}.esvee.somatic.vcf.gz` + +#### Steps +1. eSVee filtering: + - Evaluate split-read and paired-end support; discard variants with low support. + - Apply panel-of-normals filtering to remove artifacts observed in normal samples. + - Retain variants overlapping known oncogenic fusion hotspots (using UMCCR-curated lists). + - Exclude variants in repetitive regions based on Repeat Masker annotations. +2. PURPLE: + - Merge SV calls with CNV segmentation data. + - Estimate tumor purity and ploidy. + - Adjust SV breakpoints based on copy number transitions. + - Classify SVs as somatic or germline. +3. Annotation: + - Compile SV and CNV information into a unified VCF file. + - Extend the VCF header with PURPLE-related INFO fields (e.g., PURPLE_baf, PURPLE_copyNumber). 
+ - Convert CNV records from TSV format into VCF records with appropriate SVTYPE tags (e.g., 'DUP' for duplications, 'DEL' for deletions). + - Run SnpEff to annotate the unified VCF with functional information such as gene names, transcript effects, and coding consequences. +4. Prioritization: + - Run the prioritization module (forked from the AstraZeneca simple_sv_annotation tool) using reference data files including known fusion pairs, known fusion 5′ and 3′ lists, key genes, and key tumor suppressor genes. + - Classify Variants: + - Structural Variants (SVs): Variants labeled with the source `sv_esvee`. + - Copy Number Variants (CNVs): Variants labeled with the source `cnv_purple`. +5. Prioritize variants on a 4-tier system using [prioritize_sv](https://github.com/umccr/vcf_stuff/blob/master/scripts/prioritize_sv): + - **1 (high)** - **2 (moderate)** - **3 (low)** - **4 (no interest)** + - Exon loss: + - On cancer gene list (1) + - Other (2) + - Gene fusion: + - Paired (hits two genes): + - On list of known pairs (1) (curated by [HMF](https://resources.hartwigmedicalfoundation.nl)) + - One gene is a known promiscuous fusion gene (1) (curated by [HMF](https://resources.hartwigmedicalfoundation.nl)) + - On list of [FusionCatcher](https://github.com/ndaniel/fusioncatcher/blob/master/bin/generate_known.py) known pairs (2) + - Other: + - One or two genes on cancer gene list (2) + - Neither gene on cancer gene list (3) + - Unpaired (hits one gene): + - On cancer gene list (2) + - Others (3) + - Upstream or downstream: A specific type of fusion where one gene comes under the control of another gene's promoter, potentially leading to overexpression (oncogene) or underexpression (tumor suppressor gene): + - On cancer gene list genes (2) + - LoF or HIGH impact in a tumor suppressor: + - On cancer gene list (2) + - Other TS gene (3) + - Other (4) +6. 
Filter Low-Quality Calls: + - Apply Quality Filters: + - Keep variants with sufficient read support (e.g., split reads (SR) ≥ 5 and paired reads (PR) ≥ 5). + - Exclude Tier 3 and Tier 4 variants where `SR < 5` and `PR < 5`. + - Exclude Tier 3 and Tier 4 variants where `SR < 10`, `PR < 10`, and allele frequencies (`AF0` or `AF1`) are below 0.1. +7. Report: + - Generate MultiQC and cancer report outputs. + +### Germline Small Variants + +The Germline Small Variants workflow selects passing variants within the [gene panel transcript regions](https://github.com/umccr/gene_panels/tree/main/germline_panel) (built from the PMCC familial cancer clinic list) and then generates a CPSR report. + +#### Inputs +- DRAGEN VCF + - `${normal_id}.hard-filtered.vcf.gz` + +#### Output +- CPSR report + - `${normal_id}.cpsr.grch38.html` + +#### Steps +1. Prepare: + - Selection of Passing Variants: + - Raw germline variant calls from DRAGEN are filtered to retain only those variants marked as PASS (or with no filter flag). + - Selection of Gene Panel Variants: + - The filtered variants are further restricted to regions defined by the [gene panel transcript regions file](https://github.com/umccr/gene_panels/tree/main/germline_panel), based on the PMCC familial cancer clinic list. +2. Report: + - Generate CPSR (Cancer Predisposition Sequencing Report) summarizing germline findings. 
+ +--- + +## Common Reports + +### [Cancer Report](https://umccr.github.io/gpgr/) + +UMCCR cancer report containing: + +#### Tumor Mutation Burden (TMB) +- Data Source: filtered somatic VCF +- Tool: PURPLE + +#### Mutational Signatures +- Data Source: filtered somatic SNV VCF (Sigrap MutationalPatterns output) +- Tool: Sigrap (MutationalPatterns wrapper) + +#### Contamination Score + +- Data Source: – +- Note: No dedicated contamination metric is currently generated + +#### Purity & Ploidy +- Data Source: COBALT (providing read-depth ratios) and AMBER (providing B-allele frequency measurements) +- Tool: PURPLE, which uses these inputs to compute sample purity (percentage of tumor cells) and overall ploidy (average copy number) + +#### HRD Score +- Data Source: optional DRAGEN HRD score (`${tumor_id}.hrdscore.csv`), Sigrap HRDetect JSON, and oncoanalyser CHORD predictions +- Tool: DRAGEN HRD, Sigrap HRDetect, and CHORD + +#### MSI (Microsatellite Instability) +- Data Source: Indels in microsatellite regions from SNV/CNV +- Tool: PURPLE + +#### Structural Variant Metrics +- Data Source: eSVee SV VCF and PURPLE CNV segmentation +- Tools: eSVee, PURPLE, and the AstraZeneca simple_sv_annotation prioritisation rules + +#### Copy Number Metrics (Segments, Deleted Genes, etc.) +- Data Source: PURPLE CNV outputs (segmentation files, gene-level CNV TSV) +- Tool: PURPLE + +The LINX report includes the following: +- Tables of Variants: + - Breakends + - Links + - Driver Catalog +- Plots: + - Cluster-Level Plots + +### MultiQC + +General Stats: Overview of QC metrics aggregated from all tools, providing high-level sample quality information. + +DRAGEN: Mapping metrics (mapped reads, paired reads, duplicated alignments, secondary alignments), WGS coverage (average depth, cumulative coverage, per-contig coverage), fragment length distributions, trimming metrics, and time metrics for pipeline steps. 
+ +PURPLE: Sample QC status (PASS/FAIL), ploidy, tumor purity, polyclonality percentage, tumor mutational burden (TMB), microsatellite instability (MSI) status, and variant metrics for somatic and germline SNPs/indels. + +BCFtools Stats: Variant substitution types, SNP and indel counts, quality scores, variant depth, and allele frequency metrics for both somatic and germline variants. + +DRAGEN-FastQC: Per-base sequence quality, per-sequence quality scores, GC content (per-sequence and per-position), HRD score, sequence length distributions, adapter contamination, and sequence duplication levels. + +### PCGR + +The Personal Cancer Genome Reporter (PCGR) tool generates a comprehensive, interactive HTML report that consolidates filtered and annotated variant data, providing detailed insights into the somatic variants identified. + +Key Metrics: + +- Variant Classification and Tier Distribution: PCGR categorizes variants into tiers based on their clinical and biological significance. The report details the proportion of variants across different tiers, indicating their potential clinical relevance. +- Mutational Signatures: The report includes analysis of mutational signatures, offering insights into the mutational processes active in the tumor. +- Copy Number Alterations (CNAs): Visual representations of CNAs are provided, highlighting significant gains and losses across the genome. Genome-wide plots display regions of copy number gains and losses. +- Tumor Mutational Burden (TMB): Calculations of TMB are included, which can have implications for immunotherapy eligibility. The report presents the TMB value, representing the number of mutations per megabase. +- Microsatellite Instability (MSI) Status: Assessment of MSI status is performed, relevant for certain cancer types and treatment decisions. +- Clinical Trials Information: Information on relevant clinical trials is incorporated, offering potential therapeutic options based on the identified variants. 
+ +Note: The PCGR tool is designed to process a maximum of 500,000 variants. If the input VCF file contains more than this limit, variants exceeding 500,000 will be filtered out. + +### CPSR Report + +The CPSR (Cancer Predisposition Sequencing Reporter) report includes the following: + +Settings: +- Sample metadata +- Report configuration +- Virtual gene panel + +Summary of Findings: +- Variant statistics + +Variant Classification: + +ClinVar and Non-ClinVar variants: +- Class 5 - Pathogenic variants +- Class 4 - Likely Pathogenic variants +- Class 3 - Variants of Uncertain Significance (VUS) +- Class 2 - Likely Benign variants +- Class 1 - Benign variants +- Biomarkers + +PCGR TIER according to [ACMG](https://www.ncbi.nlm.nih.gov/pubmed/27993330): +- Tier 1 (High): Highest priority variants with strong clinical relevance. +- Tier 2 (Moderate): Variants with potential clinical significance. +- Tier 3 (Low): Variants with uncertain significance. +- Tier 4 (No Interest): Variants unlikely to be clinically relevant. + +--- + +## Coverage + +The sash workflow utilizes coverage metrics from DRAGEN to evaluate the sequencing quality and depth across target regions. Coverage analysis includes: + +- Mean coverage across targeted genomic regions +- Percentage of target regions covered at various depth thresholds (10X, 20X, 50X, 100X) +- Coverage uniformity metrics +- Gap analysis for regions with insufficient coverage + +These metrics are integrated into the MultiQC report, providing a comprehensive overview of sequencing quality and coverage. + +--- + +## Reference Data + +### [UMCCR Gene Panels](https://github.com/umccr/gene_panels) +Curated gene panels for specific analyses, including the germline cancer predisposition gene panel used in the Germline Small Variants workflow. 
+ +### Genome Annotations + +#### HMFtools Reference Data +- Ensembl reference data (GRCh38) +- Somatic driver catalogs +- Known fusion gene pairs +- Driver gene panels + +#### Annotation Databases: +- gnomAD (v2.1): Provides population allele frequencies to help distinguish common variants from rare ones. +- ClinVar (20220103): Offers clinically curated variant information, aiding in the interpretation of potential pathogenicity. +- COSMIC: Contains data on somatic mutations found in cancer, facilitating the identification of cancer-related variants. +- Gene Panels: Focuses analysis on specific sets of genes relevant to particular conditions or research interests. + +#### Structural Variant Data: +- SnpEff Databases: Used for predicting the effects of variants on genes and proteins. +- Panel of Normals (PON): Helps filter out technical artifacts by comparing against a set of normal samples. +- RepeatMasker: Identifies repetitive genomic regions to prevent false-positive variant calls. + +PCGR reference data (databases and datasets): + +- Version: [`pcgr_ref_data.20250314.grch38.tgz`](https://github.com/sigven/pcgr/releases) with GRCh38 VEP 113 cache (`homo_sapiens_vep_113_GRCh38.tar.gz`). Both archives are auto-extracted by the `PREPARE_REFERENCE` subworkflow. +- Contents include refreshed ClinVar, COSMIC, dbNSFP, gnomAD, OncoKB/CGI biomarker sets, and PCGR/CPSR configuration files aligned with PCGR v2.x. + +--- + +## sash Module Outputs + +### Somatic SNVs +- File: `smlv_somatic/filter/{tumor_id}.pass.vcf.gz` +- Description: Contains somatic single nucleotide variants (SNVs) with filtering applied (VCF format). + +### Somatic SVs +- File: `sv_somatic/prioritise/{tumor_id}.sv.prioritised.vcf.gz` +- Description: Contains somatic structural variants (SVs) with prioritization applied (VCF format). 
+ +### Somatic CNVs +- File: `cancer_report/cancer_report_tables/purple/{tumor_id}-purple_cnv_som.tsv.gz` +- Description: Contains somatic copy number variations (CNVs) data (TSV format). + +### Somatic Gene CNVs +- File: `cancer_report/cancer_report_tables/purple/{tumor_id}-purple_cnv_som_gene.tsv.gz` +- Description: Contains gene-level somatic copy number variations (CNVs) data (TSV format). + +### Germline SNVs +- File: `dragen_germline_output/{normal_id}.hard-filtered.vcf.gz` +- Description: Contains germline single nucleotide variants (SNVs) with hard filtering applied (VCF format). + +### Purple Purity, Ploidy, MS Status +- File: `purple/{tumor_id}.purple.purity.tsv` +- Description: Contains estimated tumor purity, ploidy, and microsatellite status (TSV format). + +### PCGR JSON with TMB +- File: `smlv_somatic/report/pcgr/{tumor_id}.pcgr.grch38.json.gz` +- Description: Contains PCGR annotations, including tumor mutational burden (TMB) (JSON format). + +### DRAGEN HRD Score (input) +- File: `${tumor_id}.hrdscore.csv` (from `dragen_somatic_dir`) +- Description: Optional DRAGEN homologous recombination deficiency (HRD) score propagated into the cancer report when provided. + +### Sigrap HRDetect +- File: `sigrap/hrdetect/hrdetect.json.gz` +- Description: HRDetect JSON summarising HRD probability from combined SNV/SV/CNV signals. + +### Sigrap MutationalPatterns +- Directory: `sigrap/mutpat/` +- Description: Mutational signature TSVs/plots (SBS/DBS/indels) generated by Sigrap’s MutationalPatterns wrapper. + +### Somatic MAF export +- File: `vcf2maf/{tumor_id}.maf` +- Description: MAF representation of the filtered somatic VCF for downstream tools that prefer MAF input. + +--- + +## FAQ + +### Q: Do we use PCGR for the rescue of SAGE? +A: Rescue is performed by BOLT using SAGE hotspot calls layered onto the DRAGEN VCF. PCGR is only used later for reporting/annotation; it does not drive the rescue step. 
+ +### Q: How are hypermutated samples handled in the current version, and is there any impact on derived metrics such as TMB or MSI? +A: In the current version of sash, hypermutated samples are identified based on a threshold of 500,000 total somatic variant counts. If the variant count exceeds this threshold, the sample is flagged as hypermutated. When this occurs, we will filter variants that: 1) don't have clinical impact, 2) aren't in hotspot regions, until we meet the threshold. This impacts the TMB and MSI calculations by PURPLE. Currently, we are using the TMB and MSI values from PURPLE in these edge cases. A future release will provide correct TMB and MSI calculations from PURPLE. + +### Q: How are we handling non-standard chromosomes if present in the input VCFs (ALTs, chrM, etc)? +A: We filter on chromosomes 1-22 and chromosomes X, Y, M. All other non-standard chromosomes and contigs are filtered out. + +### Q: What inputs for the cancer reporter - have they changed (and what can we harmonize); e.g., where is the Circos plot from at this point? +A: Circos plots are generated by PURPLE. + +### Q: We dropped the CACAO coverage reports. Can we discuss how to utilize DRAGEN or HMFtools coverage information instead? +A: DRAGEN coverage metrics are now integrated into the MultiQC report, providing a comprehensive overview of sequencing quality and coverage across the genome. We are exploring further integration of HMFtools coverage analysis for future releases. + +### Q: What TMB score is displayed in the cancer reporter? +A: The cancer report surfaces the PURPLE-derived TMB; the PCGR HTML also reports its own TMB estimate for comparison. + +### Q: What filtered VCF is the source for the mutational signatures? +A: Sigrap MutationalPatterns uses the filtered somatic VCF (post-rescue and filtering); its outputs are published under `sigrap/mutpat/` and fed into the cancer report. + +### Q: Where is the contamination score coming from currently? 
+A: Currently, sash does not calculate a dedicated contamination metric. Tumor purity estimation from PURPLE serves as the primary indicator of sample quality. + +### Q: Do the SV steps do something more than what's happening in Oncoanalyser? +A: SASH reuses the WiGiTS export to re-run eSVee with UMCCR reference data and panel-of-normals, then applies PURPLE, SnpEff and simple_sv_annotation. GRIDSS/GRIPSS are no longer used. + +### Q: Does the data from Somatic Small Variants workflow get used for the SV analysis? +A: No, the somatic small variant workflow data is not used in the structural variant (SV) workflow. These are independent analyses that run in parallel. diff --git a/docs/images/sash_overview_qc.png b/docs/images/sash_overview_qc.png new file mode 100644 index 00000000..d4fa1a0b Binary files /dev/null and b/docs/images/sash_overview_qc.png differ diff --git a/docs/images/sash_workflow_overview_diagram_Vqc.pptx b/docs/images/sash_workflow_overview_diagram_Vqc.pptx new file mode 100644 index 00000000..177de0b9 Binary files /dev/null and b/docs/images/sash_workflow_overview_diagram_Vqc.pptx differ diff --git a/docs/images/sash_workflow_overview_diagram_qc/Slide1.png b/docs/images/sash_workflow_overview_diagram_qc/Slide1.png new file mode 100644 index 00000000..7f1e4374 Binary files /dev/null and b/docs/images/sash_workflow_overview_diagram_qc/Slide1.png differ diff --git a/docs/output.md b/docs/output.md index 734dd68c..e05db6ee 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,68 +1,407 @@ -# umccr/sash: Output +# Sash Output ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document outlines the key results and files produced by the UMCCR SASH (post-processing WGS tumor/normal) pipeline. 
After a run, the pipeline organizes output files by analysis module under a directory for each tumor/normal pair (identified by run ID and sample names). The main outputs include annotated variant reports for somatic and germline variants, copy number and structural variant analyses, and a comprehensive MultiQC report for quality control. All paths below are relative to the top-level results directory of a given run. +## Pipeline overview -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. +- [Sash Output](#sash-output) + - [Introduction](#introduction) + - [Pipeline overview](#pipeline-overview) + - [Directory Structure](#directory-structure) + - [Summary](#summary) + - [Workflows](#workflows) + - [Somatic Small Variants](#somatic-small-variants) + - [General](#general) + - [Summary](#summary-1) + - [Details](#details) + - [bolt smlv somatic rescue](#bolt-smlv-somatic-rescue) + - [BOLT\_SMLV\_SOMATIC\_ANNOTATE](#bolt_smlv_somatic_annotate) + - [BOLT\_SMLV\_SOMATIC\_FILTER](#bolt_smlv_somatic_filter) + - [SOMATIC\_SNV\_REPORTS](#somatic_snv_reports) + - [VCF2MAF](#vcf2maf) + - [Somatic Structural Variants](#somatic-structural-variants) + - [General](#general-1) + - [Summary](#summary-2) + - [SV Annotation](#sv-annotation) + - [SV Prioritization](#sv-prioritization) + - [Germline Variants](#germline-variants) + - [General](#general-2) + - [Summary](#summary-3) + - [Germline Preparation](#germline-preparation) + - [Germline Reports](#germline-reports) + - [Reports](#reports) + - [Cancer Report](#cancer-report) + - [LINX Reports](#linx-reports) + - [PURPLE Reports](#purple-reports) + - [PCGR Reports](#pcgr-reports) + - [SIGRAP Reports](#sigrap-reports) + - [CPSR Reports](#cpsr-reports) + - [MultiQC Reports](#multiqc-reports) - +## Directory Structure -## Pipeline overview +```bash +[RUN_ID]/[sample]/ +├── .cancer_report.html +├── .cpsr.html +├── .pcgr.html 
+├── _linx.html +├── .multiqc.html +├── cancer_report/ +│ ├── img/ +│ └── cancer_report_tables/ +│ ├── hrd/ +│ ├── json/ +│ ├── purple/ +│ └── sigs/ +├── linx/ +│ ├── germline_annotations/ +│ ├── somatic_annotations/ +│ └── somatic_plots/ +├── multiqc_data/ +├── purple/ +├── smlv_germline/ +│ ├── prepare/ +│ └── report/ +├── smlv_somatic/ +│ ├── report/ +│ ├── annotate/ +│ ├── filter/ +│ └── rescue/ +└── sv_somatic/ + ├── annotate/ + └── prioritise/ +``` +## Summary + +The **Sash Workflow** comprises three primary pipelines: **Somatic Small Variants**, **Somatic Structural Variants**, and **Germline Variants**. These pipelines utilize **Bolt**, a Python package designed for modular processing, and leverage outputs from the **DRAGEN Variant Caller** alongside **HMFtools in Oncoanalyser**. Each pipeline is tailored to a specific type of genomic variant, incorporating filtering, annotation, and HTML reports for research and curation. + +## Workflows + +### Somatic Small Variants + +#### General + +In the **Somatic Small Variants** workflow, variant detection is performed using the **DRAGEN Variant Caller** and **Oncoanalyser (SAGE, Purple)** outputs. It's structured into four steps: **Rescue**, **Annotation**, **Filter**, and **Report**. The final outputs include an **HTML report** summarizing the results. + +#### Summary + +1. **Rescue** variants using SAGE to recover low-frequency alterations in clinically important hotspots. +2. **Annotate** variants with clinical and functional information using PCGR. +3. **Filter** variants based on allele frequency (AF), supporting reads (AD), and population frequency (gnomAD AF), removing low-confidence and common variants. +4. **Report** final annotated variants in a comprehensive HTML format. + +#### Details + +#### bolt smlv somatic rescue + +
+Output files + +- `smlv_somatic/rescue/` + - `.rescued.vcf.gz`: Rescued somatic VCF file containing previously filtered variants at known hotspots. + +
+ +The `BOLT_SMLV_SOMATIC_RESCUE` process rescues somatic variants using the BOLT tool. The output includes the rescued VCF file that recovers potentially important variants that may have been filtered in earlier steps due to borderline quality metrics. + +#### BOLT_SMLV_SOMATIC_ANNOTATE + +
+Output files + +- `smlv_somatic/annotate/` + - `.annotations.vcf.gz`: Annotated somatic VCF file with functional and clinical annotations. + +
+ +The `BOLT_SMLV_SOMATIC_ANNOTATE` process annotates somatic variants using the BOLT tool. The output includes the annotated VCF file enriched with gene information, variant effect predictions, and other annotations to aid in variant interpretation. + +#### BOLT_SMLV_SOMATIC_FILTER + +
+Output files + +- `smlv_somatic/filter/` + - `.filters_set.vcf.gz`: VCF file with filters set but all variants retained. + - `.pass.vcf.gz`: Filtered somatic VCF file containing only PASS variants. + - `.pass.vcf.gz.tbi`: Index file for the filtered VCF. + +
+ +The `BOLT_SMLV_SOMATIC_FILTER` process filters somatic variants using the BOLT tool. The output includes both a VCF with all variants but filter tags applied, and a filtered VCF containing only variants that pass all quality filters. + +#### SOMATIC_SNV_REPORTS + +
+Output files + +- `smlv_somatic/report/` + - `.somatic.bcftools_stats.txt`: Statistical summary of somatic variants. + - `.somatic.variant_counts_process.json`: Variant count metrics at each processing step. + - `.somatic.variant_counts_type.yaml`: Variant counts by variant type. + - `af_tumor.txt`: Information about variant allele frequencies in tumor. + - `af_tumor_keygenes.txt`: Variant allele frequencies in key cancer-related genes. + - `pcgr/`: Directory containing PCGR report outputs. + +
+ +The reporting process generates statistical summaries and specialized reports for somatic SNVs, including PCGR HTML reports for clinical interpretation. + +#### VCF2MAF + +
+Output files + +- `vcf2maf/.maf`: MAF derived from the filtered somatic VCF. + +
+ +`vcf2maf` (v1.6.22) converts the filtered somatic VCF into MAF format for downstream consumers that do not accept VCF. + +### Somatic Structural Variants + +#### General + +The **Somatic Structural Variants** workflow identifies and analyzes large genomic rearrangements such as deletions, duplications, inversions, and translocations. It rebuilds SVs with eSVee using the oncoanalyser WiGiTS export, layers PURPLE CNV/purity estimates, and then annotates/prioritises events for GPGR/LINX reporting. + +#### Summary + +1. **Annotate** SVs with gene context and potential functional impacts. +2. **Prioritize** SVs based on cancer relevance and gene disruption potential. +3. **Report** clinically relevant SVs with gene fusion predictions and visualization. + +#### SV Annotation + +
+Output files -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +- `sv_somatic/annotate/` + - `.annotated.vcf.gz`: Annotated structural variant VCF file. -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +
+ +This process adds gene annotations and functional impact predictions to structural variants. The annotated VCF contains information about genes affected by breakpoints, potential fusion events, and other biologically relevant details. + +#### SV Prioritization + +
+Output files + +- `sv_somatic/prioritise/` + - `.cnv.prioritised.tsv`: Prioritized copy number variations in tabular format. + - `.sv.prioritised.tsv`: Prioritized structural variants in tabular format. + - `.sv.prioritised.vcf.gz`: Prioritized structural variants in VCF format. + +
+ +This process ranks structural variants based on their potential clinical relevance, creating filterable lists for review. It separately handles copy number variations and other structural variants for easier interpretation. + +### Germline Variants + +#### General + +The **Germline Variants** workflow analyzes inherited variants from the normal sample to identify potential cancer predisposition genes and variants that may influence treatment decisions. + +#### Summary + +1. **Prepare** germline variants from DRAGEN normal sample outputs. +2. **Report** potentially actionable germline variants through CPSR. -### FastQC +#### Germline Preparation
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `smlv_germline/prepare/` + - `.prepared.vcf.gz`: Prepared germline VCF file for annotation.
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +This process prepares germline variants for downstream annotation and reporting. It applies normalization, left-alignment, and other preprocessing steps to ensure consistent variant representation. -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +#### Germline Reports -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +
+Output files + +- `smlv_germline/report/` + - `.annotations.vcf.gz`: Annotated germline VCF. + - `.germline.bcftools_stats.txt`: Statistical summary of germline variants. + - `.germline.variant_counts_type.yaml`: Variant counts by type. + - `cpsr/`: Directory containing CPSR outputs. + - `.cpsr.grch38.json.gz`: Structured CPSR data. + - `.cpsr.grch38.pass.tsv.gz`: Filtered CPSR variants in tabular format. + - `.cpsr.grch38.snvs_indels.tiers.tsv`: Tiered variants by clinical significance. + - Other CPSR output files. + +
-![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +The germline reporting process focuses on identifying variants in cancer predisposition genes and producing a comprehensive CPSR (Cancer Predisposition Sequencing Reporter) report. -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +### Reports -### MultiQC +#### Cancer Report
Output files -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `.cancer_report.html`: Main cancer report HTML file. +- `cancer_report/` + - `.snvs.normalised.vcf.gz`: Normalized SNVs used in the report. + - `img/`: Images used in the cancer report. + - `cancer_report_tables/`: Tabular data supporting the report. + - `_-qc_summary.tsv.gz`: Quality control summary. + - `_-report_inputs.tsv.gz`: Report configuration inputs. + - `hrd/`: Homologous Recombination Deficiency analysis from multiple methods. + - `json/`: JSON-formatted report data. + - `purple/`: Copy number information. + - `sigs/`: Mutational signature analysis (SBS, DBS, indels).
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +The cancer report integrates findings from all analysis modules into a comprehensive HTML report for clinical interpretation. It includes tumor characteristics, key somatic alterations, mutational signatures, and therapy recommendations. + +#### LINX Reports + +
+Output files + +- `_linx.html`: LINX visualization report. +- `linx/` + - `germline_annotations/`: Germline structural variant analysis. + - `.linx.germline.breakend.tsv`: Germline breakend annotations. + - `.linx.germline.clusters.tsv`: Germline SV clusters. + - `.linx.germline.disruption.tsv`: Gene disruptions by germline SVs. + - `.linx.germline.driver.catalog.tsv`: Potential driver germline SVs. + - `.linx.germline.links.tsv`: Links between germline SVs. + - `.linx.germline.svs.tsv`: Germline structural variants. + - `linx.version`: LINX version information. + - `somatic_annotations/`: Somatic structural variant analysis. + - `.linx.breakend.tsv`: Somatic breakend annotations. + - `.linx.clusters.tsv`: Somatic SV clusters. + - `.linx.driver.catalog.tsv`: Potential driver somatic SVs. + - `.linx.drivers.tsv`: Driver SV details. + - `.linx.fusion.tsv`: Gene fusion predictions. + - `.linx.links.tsv`: Links between somatic SVs. + - `.linx.svs.tsv`: Somatic structural variants. + - Visualization data files (vis_*). + - `linx.version`: LINX version information. + - `somatic_plots/`: Visualizations of somatic structural variants. + +
+ +LINX reports provide detailed analysis of structural variants, including gene fusions, disruptions, and visualization of complex rearrangements. The HTML visualization report offers interactive exploration of structural variants and their potential functional impacts. + +#### PURPLE Reports + +
+Output files + +- `purple/` + - `.purple.cnv.gene.tsv`: Gene-level copy number variations. + - `.purple.cnv.somatic.tsv`: Segment-level somatic copy number variations. + - `.purple.driver.catalog.germline.tsv`: Potential germline driver variants. + - `.purple.driver.catalog.somatic.tsv`: Potential somatic driver variants. + - `.purple.germline.deletion.tsv`: Germline deletion information. + - `.purple.purity.range.tsv`: Range of possible purity values. + - `.purple.purity.tsv`: Tumor purity, ploidy, and microsatellite status. + - `.purple.qc`: Quality control metrics. + - `.purple.segment.tsv`: Genomic segmentation data. + - `.purple.somatic.clonality.tsv`: Clonality analysis of somatic variants. + - `.purple.somatic.hist.tsv`: Somatic variant histograms. + - `.purple.somatic.vcf.gz`: Somatic variants VCF with copy number annotations. + - `.purple.sv.germline.vcf.gz`: Germline structural variants. + - `.purple.sv.vcf.gz`: Somatic structural variants. + - `circos/`: Circos visualization data files. + - `.ratio.circos`: Normal sample coverage ratio data. + - `.baf.circos`: B-allele frequency data. + - `.cnv.circos`: Copy number data. + - `.indel.circos`: Indel visualization data. + - `.link.circos`: SV links visualization data. + - `.snp.circos`: SNP visualization data. + - Configuration and input files for Circos. + - `plot/`: Additional visualization data. + - `purple.version`: PURPLE version information. + +
+ +PURPLE reports provide copy number analysis, tumor purity estimation, and whole genome doubling assessment. The circos directory contains data for generating circular genome plots that visualize genomic alterations across the entire genome. + +#### PCGR Reports + +
+Output files + +- `.pcgr.html`: PCGR HTML report. +- `smlv_somatic/report/pcgr/` + - `.pcgr.grch38.flexdb.html`: Flexible database PCGR report. + - `.pcgr.grch38.json.gz`: Structured PCGR data in JSON format. + - `.pcgr.grch38.mp_input.vcf.gz`: Input VCF for mutational pattern analysis. + - `.pcgr.grch38.mutational_signatures.tsv`: Mutational signature analysis. + - `.pcgr.grch38.pass.tsv.gz`: Filtered variants in tabular format. + - `.pcgr.grch38.pass.vcf.gz`: Filtered variants in VCF format. + - `.pcgr.grch38.snvs_indels.tiers.tsv`: Tiered variants by clinical significance. + - `.pcgr.grch38.vcf.gz`: All variants in VCF format. + - `.pcgr_config.rds`: PCGR configuration. + +
+ +PCGR (Personal Cancer Genome Reporter) reports provide clinical interpretation of somatic variants, including therapy matches, clinical trial eligibility, and tumor mutational burden assessment. + +#### SIGRAP Reports + +
+Output files + +- `sigrap/mutpat/`: SBS/DBS/indel signature TSVs and plots from Sigrap’s MutationalPatterns wrapper. +- `sigrap/hrdetect/hrdetect.json.gz`: HRDetect JSON summarising HRD probability from SNV/SV/CNV features. +- Cancer report tables fold these results under `cancer_report/cancer_report_tables/sigs/`. + +
+ +SIGRAP reports provide mutational signature analysis, identifying patterns associated with specific mutational processes or exposures, and HRD probability via HRDetect. Outputs are published separately and embedded into the cancer report; CHORD predictions from oncoanalyser are ingested alongside HRDetect. + +#### CPSR Reports + +
+Output files + +- `.cpsr.html`: CPSR HTML report. +- `smlv_germline/report/cpsr/` + - `.cpsr.grch38.custom_list.bed`: Custom gene list in BED format. + - `.cpsr.grch38.json.gz`: Structured CPSR data in JSON format. + - `.cpsr.grch38.pass.tsv.gz`: Filtered variants in tabular format. + - `.cpsr.grch38.pass.vcf.gz`: Filtered variants in VCF format. + - `.cpsr.grch38.snvs_indels.tiers.tsv`: Tiered variants by clinical significance. + - `.cpsr.grch38.vcf.gz`: All variants in VCF format. + - `.cpsr_config.rds`: CPSR configuration. + +
-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +CPSR (Cancer Predisposition Sequencing Reporter) focuses on germline variants in known cancer predisposition genes, providing a comprehensive report of inherited cancer risk variants. -### Pipeline information +#### MultiQC Reports
Output files -- `pipeline_info/` - - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. +- `.multiqc.html`: Main MultiQC report. +- `multiqc_data/`: Supporting data for the MultiQC report. + - `dragen_frag_len.txt`: Fragment length metrics. + - `dragen_map_metrics.txt`: Mapping metrics. + - `dragen_ploidy.txt`: Ploidy estimation metrics. + - `dragen_time_metrics.txt`: Processing time metrics. + - `dragen_trimmer_metrics.txt`: Read trimming metrics. + - `dragen_vc_metrics.txt`: Variant calling metrics. + - `dragen_wgs_cov_metrics.txt`: WGS coverage metrics. + - `multiqc.log`: MultiQC log file. + - `multiqc_bcftools_stats.txt`: BCFtools statistics. + - `multiqc_data.json`: MultiQC data in JSON format. + - `multiqc_general_stats.txt`: General statistics. + - `purple.txt`: PURPLE metrics.
-[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. +MultiQC aggregates quality metrics from all pipeline components into a single HTML report, providing an overview of sample quality and analysis performance. diff --git a/docs/usage.md b/docs/usage.md index 37f6aac6..a44af4e3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,194 +1,180 @@ # umccr/sash: Usage -> _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ +> Parameter documentation is generated automatically from `nextflow_schema.json`. Run `nextflow run umccr/sash --help` +> or use [nf-core/launch](https://nf-co.re/launch) for an interactive form. ## Introduction - +umccr/sash is UMCCR’s post-processing pipeline for tumour/normal WGS analyses. It consumes DRAGEN secondary-analysis +outputs together with nf-core/oncoanalyser WiGiTS artefacts to perform small-variant rescue, annotation, filtering, +structural variant integration, PURPLE CNV calling, and reporting (PCGR, CPSR, GPGR cancer report, LINX, MultiQC). + +- Requires Nextflow ≥ 22.10.6 and a container engine (Docker/Singularity/Apptainer/Podman/Conda). +- Uses GRCh38 reference data resolved from `--ref_data_path` (see [Reference data](#reference-data)). +- Expects inputs via a CSV samplesheet describing DRAGEN and Oncoanalyser directories; no FASTQ inputs are needed. ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. 
+Pass a CSV with `--input`. Each row represents one directory staged by upstream pipelines for a given analysis `id`. +Rows sharing the same `id` are grouped into a single tumour/normal run. -```bash ---input '[path to samplesheet file]' -``` +### Column definitions -### Multiple runs of the same sample +| Column | Description | +| -------------- | ----------- | +| `id` | Unique analysis identifier grouping rows belonging to the same tumour/normal pair. | +| `subject_name` | Subject identifier; must be identical for all rows with the same `id`. | +| `sample_name` | DRAGEN sample label. Used to derive tumour (`dragen_somatic_dir`) and normal (`dragen_germline_dir`) identifiers. | +| `filetype` | One of the supported directory types below. | +| `filepath` | Absolute or relative path to the directory containing the expected files. | -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +Example row set: -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +```csv +id,subject_name,sample_name,filetype,filepath +subject_a.example,subject_a,sample_germline,dragen_germline_dir,/path/to/dragen_germline/ +subject_a.example,subject_a,sample_somatic,dragen_somatic_dir,/path/to/dragen_somatic/ +subject_a.example,subject_a,sample_somatic,oncoanalyser_dir,/path/to/oncoanalyser/ ``` -### Full samplesheet +An example sheet is included at `assets/samplesheet.csv`. -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. 
The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +### Required directory contents -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +Paths below are relative to the value of `filepath` for each row. The pipeline targets nf-core/oncoanalyser ≥ 2.2.0 +exports. -```console -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, -``` - -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". 
| +- `dragen_somatic_dir` + - `.hard-filtered.vcf.gz` and `.hard-filtered.vcf.gz.tbi` + - Optional: `.hrdscore.csv` (ingested into the cancer report when present) +- `dragen_germline_dir` + - `.hard-filtered.vcf.gz` +- `oncoanalyser_dir` + - `amber/` and `cobalt/` directories (coverage inputs for PURPLE) + - `sage_calling/somatic/.sage.somatic.vcf.gz` (+ `.tbi`) + - `esvee/.esvee.ref_depth.vcf.gz` and accompanying directory (used to seed eSVee calling) + - `virusbreakend/` directory + - Optional: `chord/.chord.prediction.tsv` (HRD predictions surfaced in the cancer report) -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +> SASH runs PURPLE internally; precomputed PURPLE outputs are not required as inputs. ## Running the pipeline -The typical command for running the pipeline is as follows: +Quickstart command: ```bash -nextflow run umccr/sash --input samplesheet.csv --outdir --genome GRCh37 -profile docker +nextflow run umccr/sash \ + --input samplesheet.csv \ + --ref_data_path /path/to/reference_data_root \ + --outdir results/ \ + -profile docker ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. - -Note that the pipeline will create the following files in your working directory: +This launches the pipeline with the `docker` configuration profile. The following appear in the working directory: -```bash -work # Directory containing the nextflow working files - # Finished results in specified location (defined with --outdir) -.nextflow_log # Log file from Nextflow -# Other nextflow hidden files, eg. history of pipeline runs and old logs. +``` +work/ # Nextflow working files +results/ # Pipeline outputs (as specified by --outdir) +.nextflow.log # Nextflow run log ``` -If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file.
- -Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. +### Parameter files and profiles -> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -> The above pipeline run specified with a params file in yaml format: +Reuse parameter sets via `-params-file params.yaml`: ```bash nextflow run umccr/sash -profile docker -params-file params.yaml ``` -with `params.yaml` containing: - ```yaml -input: './samplesheet.csv' -outdir: './results/' -genome: 'GRCh37' -input: 'data' -<...> -``` - -You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). - -### Updating the pipeline - -When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: - -```bash -nextflow pull umccr/sash +input: 'samplesheet.csv' +ref_data_path: '/data/refdata' +outdir: 'results/' ``` -### Reproducibility +> ⚠️ Avoid using `-c` to pass pipeline parameters. `-c` should only point to Nextflow config files for resource tuning, +> executor settings or module overrides (see below). -It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. 
+You can generate YAML/JSON parameter files through [nf-core/launch](https://nf-co.re/launch) or Nextflow Tower. -First, go to the [umccr/sash releases page](https://github.com/umccr/sash/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. +## Reference data -This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. +All resources are resolved relative to `--ref_data_path` using `conf/refdata.config`. Confirm the directory contains the +expected subpaths (versions may change between releases): -To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. +- `genomes/GRCh38_umccr/` – GRCh38 FASTA, FAI and dict files plus sequence metadata. +- `hmf_reference_data/` – WiGiTS bundle with PURPLE GC profiles, eSVee panel-of-normals, SAGE hotspot resources, LINX + transcripts and driver catalogues. +- `databases/pcgr/` – PCGR/CPSR bundle as `pcgr_ref_data.<version>.grch38.tgz` plus GRCh38 VEP cache `homo_sapiens_vep_113_GRCh38.tar.gz` (both auto-extracted by the pipeline). +- `umccr/` – bolt configuration files, driver panels, MultiQC templates, GPGR assets. +- `misc/` – panel-of-normals, APPRIS annotations, snpEff cache and other supporting data. -> 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +Compressed PCGR/VEP archives can be left in place; the `PREPARE_REFERENCE` stage extracts them into the run work directory before use.
-## Core Nextflow arguments +Refer to [details.md](details.md) for a deeper breakdown of required artefacts. -> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +## Nextflow configuration ### `-profile` -Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. - -Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. - -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. - -Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! -They are loaded in sequence, so later profiles can overwrite earlier profiles. - -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment. 
- -- `test` - - A profile with a complete configuration for automated testing - - Includes links to test data so needs no other parameters -- `docker` - - A generic configuration profile to be used with [Docker](https://docker.com/) -- `singularity` - - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) -- `podman` - - A generic configuration profile to be used with [Podman](https://podman.io/) -- `shifter` - - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) -- `charliecloud` - - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) -- `apptainer` - - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) -- `conda` - - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. +Profiles configure software packaging and cluster backends. Bundled profiles include `test`, `docker`, `singularity`, +`podman`, `shifter`, `charliecloud`, `apptainer` and `conda`. Combine multiple profiles with commas (later entries +override earlier ones). If no profile is supplied, Nextflow expects all software on `$PATH`, which is discouraged. ### `-resume` -Specify this when restarting a pipeline. Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. For input to be considered the same, not only the names must be identical but the files' contents as well. For more info about this parameter, see [this blog post](https://www.nextflow.io/blog/2019/demystifying-nextflow-resume.html). - -You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. +Resume cached work by adding `-resume`. 
Nextflow matches stages using both file names and content; keep inputs identical +for cache hits. Supply a run name to resume a specific execution: `-resume <run-name>`. Use `nextflow log` to list +previous runs. ### `-c` -Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. +`-c custom.config` loads additional Nextflow configuration (eg. executor queues, resource overrides, institutional +profiles). See the [nf-core configuration docs](https://nf-co.re/docs/usage/configuration) for examples. ## Custom configuration ### Resource requests -Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. +Default resources suit typical datasets, but you can override CPUs/memory/time through custom config files. Many modules +honour nf-core’s automatic retry logic: certain exit codes trigger resubmission at 2× and 3× the original resources +before failing the run. Refer to the nf-core guides on +[max resources](https://nf-co.re/docs/usage/configuration#max-resources) and +[tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources).
-To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. +### Custom containers -### Custom Containers +nf-core pipelines default to Biocontainers/Bioconda images. You can override container or conda package selections in +config to use patched or institutional builds. See the +[updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section for patterns. -In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. +### Custom tool arguments -To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. +If you need to provide additional tool parameters beyond those exposed by pipeline options, set `process.ext.args` +(overrides per-module) or leverage module-specific hooks documented in nf-core. Review `conf/modules.config` for +supported overrides in umccr/sash. PCGR chunk sizing can also be overridden globally via `--pcgr_variant_chunk_size` +if PCGR struggles with exceptionally large VCFs. -### Custom Tool Arguments +## Outputs -A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. 
+See [output.md](output.md) for a full description of generated artefacts (PCGR/CPSR HTML, cancer report, LINX, PURPLE, MultiQC +and supporting statistics). ## Running in the background -Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. - -The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file. +Nextflow supervises submitted jobs; keep the Nextflow process alive for the pipeline to finish. Options include: -Alternatively, you can use `screen` / `tmux` or similar tool to create a detached session which you can log back into at a later time. -Some HPC setups also allow you to run nextflow within a cluster job submitted your job scheduler (from where it submits more jobs). +- `nextflow run ... -bg` to launch detached and log to `.nextflow.log`. +- Using `screen`, `tmux` or similar to keep sessions alive. +- Submitting Nextflow itself through your scheduler (eg. `sbatch`), where it will launch child jobs. ## Nextflow memory requirements -In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. -We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): +The Nextflow JVM can request substantial RAM on large runs. Set an upper bound via environment variables, typically in +`~/.bashrc` or `~/.bash_profile`: ```bash -NXF_OPTS='-Xms1g -Xmx4g' +export NXF_OPTS='-Xms1g -Xmx4g' ``` + +Adjust limits to suit your environment. 
diff --git a/modules/local/bolt/other/cancer_report/main.nf b/modules/local/bolt/other/cancer_report/main.nf index 19469ca1..dff58fae 100644 --- a/modules/local/bolt/other/cancer_report/main.nf +++ b/modules/local/bolt/other/cancer_report/main.nf @@ -2,10 +2,10 @@ process BOLT_OTHER_CANCER_REPORT { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17-gpgr' + container 'docker.io/qclayssen/bolt:0.3.0-dev-14-gpgr' input: - tuple val(meta), path(smlv_somatic_vcf), path(smlv_somatic_bcftools_stats), path(smlv_somatic_counts_process), path(sv_somatic_tsv), path(sv_somatic_vcf), path(cnv_somatic_tsv), path(af_global), path(af_keygenes), path(purple_baf_plot), path(purple_dir), path(virusbreakend_dir), path(dragen_hrd) + tuple val(meta), path(smlv_somatic_vcf), path(smlv_somatic_bcftools_stats), path(smlv_somatic_counts_process), path(sv_somatic_tsv), path(sv_somatic_vcf), path(cnv_somatic_tsv), path(af_global), path(af_keygenes), path(purple_baf_plot), path(purple_dir), path(virusbreakend_dir), path(dragen_hrd), path(smlv_somatic_mutpat), path(smlv_somatic_hrdetect), path(smlv_somatic_chord) path somatic_driver_panel path oncokb_genes @@ -35,6 +35,10 @@ process BOLT_OTHER_CANCER_REPORT { --smlv_somatic_bcftools_stats_fp \$(pwd)/${smlv_somatic_bcftools_stats} \\ --smlv_somatic_counts_process_fp \$(pwd)/${smlv_somatic_counts_process} \\ \\ + --mutpat_dir \$(pwd)/${smlv_somatic_mutpat} \\ + --hrdetect_file \$(pwd)/${smlv_somatic_hrdetect} \\ + --chord_file \$(pwd)/${smlv_somatic_chord} \\ + \\ --sv_somatic_tsv_fp \$(pwd)/${sv_somatic_tsv} \\ --sv_somatic_vcf_fp \$(pwd)/${sv_somatic_vcf} \\ --cnv_somatic_tsv_fp \$(pwd)/${cnv_somatic_tsv} \\ diff --git a/modules/local/bolt/other/multiqc_report/main.nf b/modules/local/bolt/other/multiqc_report/main.nf index f26d6921..a5752f09 100644 --- a/modules/local/bolt/other/multiqc_report/main.nf +++ b/modules/local/bolt/other/multiqc_report/main.nf @@ -2,7 +2,7 @@ process BOLT_OTHER_MULTIQC_REPORT { tag 
"${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17-multiqc' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20-multiqc' input: tuple val(meta), path(input_files) diff --git a/modules/local/bolt/other/purple_baf_plot/main.nf b/modules/local/bolt/other/purple_baf_plot/main.nf index c52d8202..6ccb318a 100644 --- a/modules/local/bolt/other/purple_baf_plot/main.nf +++ b/modules/local/bolt/other/purple_baf_plot/main.nf @@ -2,7 +2,7 @@ process BOLT_OTHER_PURPLE_BAF_PLOT { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17-circos' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20-circos' input: tuple val(meta), path(purple_dir) diff --git a/modules/local/bolt/smlv_germline/prepare/main.nf b/modules/local/bolt/smlv_germline/prepare/main.nf index 2fc3cc4a..615078dd 100644 --- a/modules/local/bolt/smlv_germline/prepare/main.nf +++ b/modules/local/bolt/smlv_germline/prepare/main.nf @@ -2,7 +2,7 @@ process BOLT_SMLV_GERMLINE_PREPARE { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20' input: tuple val(meta), path(smlv_vcf) diff --git a/modules/local/bolt/smlv_germline/report/main.nf b/modules/local/bolt/smlv_germline/report/main.nf index 3f5fdea9..0acfffde 100644 --- a/modules/local/bolt/smlv_germline/report/main.nf +++ b/modules/local/bolt/smlv_germline/report/main.nf @@ -2,12 +2,13 @@ process BOLT_SMLV_GERMLINE_REPORT { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17-pcgr' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20-pcgr' input: tuple val(meta), path(smlv_vcf), path(smlv_unfiltered_vcf) path germline_predisposition_panel_genes path pcgr_data_dir + path vep_dir output: tuple val(meta), path("output/*.variant_counts_type.yaml"), emit: counts_type @@ -32,6 +33,7 @@ process BOLT_SMLV_GERMLINE_REPORT { --pcgrr_conda pcgrr \\ --germline_panel_list_fp ${germline_predisposition_panel_genes} \\ --pcgr_data_dir ${pcgr_data_dir} \\ + --vep_dir 
${vep_dir} \\ --threads ${task.cpus} \\ --output_dir output/ diff --git a/modules/local/bolt/smlv_somatic/annotate/main.nf b/modules/local/bolt/smlv_somatic/annotate/main.nf index d28aeeb4..f6bb82cc 100644 --- a/modules/local/bolt/smlv_somatic/annotate/main.nf +++ b/modules/local/bolt/smlv_somatic/annotate/main.nf @@ -2,7 +2,7 @@ process BOLT_SMLV_SOMATIC_ANNOTATE { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17-pcgr' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20-pcgr' input: tuple val(meta), path(smlv_vcf) @@ -10,6 +10,7 @@ process BOLT_SMLV_SOMATIC_ANNOTATE { path annotations_dir path pon_dir path pcgr_data_dir + path vep_dir output: tuple val(meta), path("output/${meta.tumor_id}.annotations.vcf.gz"), emit: vcf @@ -19,7 +20,7 @@ process BOLT_SMLV_SOMATIC_ANNOTATE { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + def chunk_size_arg = params.pcgr_variant_chunk_size ? "--pcgr_variant_chunk_size ${params.pcgr_variant_chunk_size}" : '' """ bolt smlv_somatic annotate \\ @@ -30,10 +31,12 @@ process BOLT_SMLV_SOMATIC_ANNOTATE { --annotations_dir ${annotations_dir} \\ --pon_dir ${pon_dir} \\ --pcgr_data_dir ${pcgr_data_dir} \\ + --vep_dir ${vep_dir} \\ --pcgr_conda pcgr \\ --pcgrr_conda pcgrr \\ --threads ${task.cpus} \\ - --output_dir output/ + --output_dir output/ \\ + ${chunk_size_arg} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/bolt/smlv_somatic/filter/main.nf b/modules/local/bolt/smlv_somatic/filter/main.nf index 93519a82..f76db5f8 100644 --- a/modules/local/bolt/smlv_somatic/filter/main.nf +++ b/modules/local/bolt/smlv_somatic/filter/main.nf @@ -2,7 +2,7 @@ process BOLT_SMLV_SOMATIC_FILTER { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20' input: tuple val(meta), path(smlv_vcf) diff --git a/modules/local/bolt/smlv_somatic/report/main.nf b/modules/local/bolt/smlv_somatic/report/main.nf index 
cbc160c1..15225680 100644 --- a/modules/local/bolt/smlv_somatic/report/main.nf +++ b/modules/local/bolt/smlv_somatic/report/main.nf @@ -2,11 +2,12 @@ process BOLT_SMLV_SOMATIC_REPORT { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17-pcgr' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20-pcgr' input: tuple val(meta), path(smlv_vcf), path(smlv_filters_vcf), path(smlv_dragen_vcf), path(purple_purity) path pcgr_data_dir + path vep_dir path somatic_driver_panel_regions_coding path giab_regions path genome_fasta @@ -19,7 +20,7 @@ process BOLT_SMLV_SOMATIC_REPORT { tuple val(meta), path("output/*.variant_counts_type.yaml") , emit: counts_type tuple val(meta), path("output/*.variant_counts_process.json"), emit: counts_process path 'output/pcgr/' , emit: pcgr_dir - path "output/*.pcgr_acmg.grch38.html" , emit: pcgr_report + path "output/*.pcgr.grch38.html" , emit: pcgr_report path 'versions.yml' , emit: versions when: @@ -40,6 +41,7 @@ process BOLT_SMLV_SOMATIC_REPORT { --pcgr_conda pcgr \\ --pcgrr_conda pcgrr \\ --pcgr_data_dir ${pcgr_data_dir} \\ + --vep_dir ${vep_dir} \\ --purple_purity_fp ${purple_purity} \\ \\ --cancer_genes_fp ${somatic_driver_panel_regions_coding} \\ @@ -49,7 +51,7 @@ process BOLT_SMLV_SOMATIC_REPORT { --threads ${task.cpus} \\ --output_dir output/ - mv output/pcgr/${meta.tumor_id}.pcgr_acmg.grch38.html output/ + mv output/pcgr/${meta.tumor_id}.pcgr.grch38.html output/ cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -66,7 +68,7 @@ process BOLT_SMLV_SOMATIC_REPORT { touch output/${meta.tumor_id}.somatic.variant_counts_type.yaml touch output/${meta.tumor_id}.somatic.variant_counts_process.json touch output/${meta.tumor_id}.somatic.bcftools_stats.txt - touch output/${meta.tumor_id}.pcgr_acmg.grch38.html + touch output/${meta.tumor_id}.pcgr.grch38.html echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml """ } diff --git a/modules/local/bolt/smlv_somatic/rescue/main.nf 
b/modules/local/bolt/smlv_somatic/rescue/main.nf index 2e1ac64b..f274231d 100644 --- a/modules/local/bolt/smlv_somatic/rescue/main.nf +++ b/modules/local/bolt/smlv_somatic/rescue/main.nf @@ -2,7 +2,7 @@ process BOLT_SMLV_SOMATIC_RESCUE { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20' input: tuple val(meta), path(smlv_vcf), path(smlv_tbi), path(sage_smlv_vcf), path(sage_smlv_tbi) diff --git a/modules/local/bolt/sv_somatic/annotate/main.nf b/modules/local/bolt/sv_somatic/annotate/main.nf index af48b4f5..ecd27b5c 100644 --- a/modules/local/bolt/sv_somatic/annotate/main.nf +++ b/modules/local/bolt/sv_somatic/annotate/main.nf @@ -2,7 +2,7 @@ process BOLT_SV_SOMATIC_ANNOTATE { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17-snpeff' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20-snpeff' input: tuple val(meta), path(sv_vcf), path(cnv_tsv) diff --git a/modules/local/bolt/sv_somatic/prioritise/main.nf b/modules/local/bolt/sv_somatic/prioritise/main.nf index 710cb192..4f78b51b 100644 --- a/modules/local/bolt/sv_somatic/prioritise/main.nf +++ b/modules/local/bolt/sv_somatic/prioritise/main.nf @@ -2,7 +2,7 @@ process BOLT_SV_SOMATIC_PRIORITISE { tag "${meta.id}" label 'process_low' - container 'ghcr.io/umccr/bolt:0.2.17' + container 'ghcr.io/umccr/bolt:0.3.0-dev-20' input: tuple val(meta), path(sv_vcf) diff --git a/modules/local/custom/extract_tarball/main.nf b/modules/local/custom/extract_tarball/main.nf new file mode 100644 index 00000000..1ebc1011 --- /dev/null +++ b/modules/local/custom/extract_tarball/main.nf @@ -0,0 +1,33 @@ +process CUSTOM_EXTRACTTARBALL { + label 'process_single' + + conda "conda-forge::tar=1.34" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'quay.io/nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(tarball) + + output: + path "${meta.id}/", emit: extracted_dir + path '.command.*', emit: command_files + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def strip = meta.strip_components != null ? meta.strip_components : 1 + def target = meta.subdir ? "${meta.id}/${meta.subdir}" : "${meta.id}" + + """ + mkdir -p ${target} + tar ${args} -xzvf ${tarball} --strip-components ${strip} -C ${target}/ + """ + + stub: + """ + mkdir -p ${meta.id}/ + """ +} diff --git a/modules/local/pave/somatic/meta.yml b/modules/local/pave/somatic/meta.yml index 97aa6729..552537c4 100644 --- a/modules/local/pave/somatic/meta.yml +++ b/modules/local/pave/somatic/meta.yml @@ -12,7 +12,7 @@ tools: description: Annotates small variant VCF with gene, transcript coding and protein effects. homepage: https://github.com/hartwigmedical/hmftools/tree/master/pave documentation: https://github.com/hartwigmedical/hmftools/tree/master/pave - licence: ['GPL v3'] + licence: ["GPL v3"] input: - meta: type: map diff --git a/modules/local/sigrap/chord/main.nf b/modules/local/sigrap/chord/main.nf new file mode 100644 index 00000000..44810f60 --- /dev/null +++ b/modules/local/sigrap/chord/main.nf @@ -0,0 +1,37 @@ +process SIGRAP_CHORD { + tag "${meta.id}" + label 'process_low' + + container 'docker.io/qclayssen/sigrap:0.2.0-dev-7' + + input: + tuple val(meta), path(chord_prediction_tsv) + + output: + tuple val(meta), path('chord.json.gz') , emit: chord_json + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + """ + sigrap.R chord \\ + --sample ${meta.id} \\ + --chord ${chord_prediction_tsv} \\ + --out chord.json.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sigrap: \$(sigrap.R --version | sed 's/^.*version //') + END_VERSIONS + 
""" + + stub: + """ + touch chord.json.gz + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + """ +} diff --git a/modules/local/sigrap/hrdetect/main.nf b/modules/local/sigrap/hrdetect/main.nf new file mode 100644 index 00000000..8cb5fa57 --- /dev/null +++ b/modules/local/sigrap/hrdetect/main.nf @@ -0,0 +1,39 @@ +process SIGRAP_HRDETECT { + tag "${meta.id}" + label 'process_low' + + container 'docker.io/qclayssen/sigrap:0.2.0-dev-7' + + input: + tuple val(meta), path(smlv_somatic_vcf), path(sv_somatic_vcf), path(cnv_somatic_tsv) + + output: + tuple val(meta), path('hrdetect.json.gz') , emit: hrdetect_json + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + """ + sigrap.R hrdetect \\ + --sample ${meta.id} \\ + --snv ${smlv_somatic_vcf} \\ + --sv ${sv_somatic_vcf} \\ + --cnv ${cnv_somatic_tsv} \\ + --out hrdetect.json.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sigrap: \$(sigrap.R --version | sed 's/^.*version //') + END_VERSIONS + """ + + stub: + """ + touch hrdetect.json.gz + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + """ +} diff --git a/modules/local/sigrap/mutpat/main.nf b/modules/local/sigrap/mutpat/main.nf new file mode 100644 index 00000000..b4b34102 --- /dev/null +++ b/modules/local/sigrap/mutpat/main.nf @@ -0,0 +1,41 @@ +process SIGRAP_MUTPAT { + tag "${meta.id}" + label 'process_low' + + container 'docker.io/qclayssen/sigrap:0.2.0-dev-7' + + input: + tuple val(meta), path(smlv_somatic_vcf) + + output: + tuple val(meta), path('mutpat/') , emit: mutpat_output + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + """ + sigrap.R mutpat \\ + --sample ${meta.id} \\ + --snv ${smlv_somatic_vcf} \\ + --rainfall \\ + --strand-bias \\ + --predefined-dbs-mbs \\ + --out mutpat/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + 
sigrap: \$(sigrap.R --version | sed 's/^.*version //') + END_VERSIONS + """ + + stub: + """ + mkdir -p mutpat/ # stub must create the declared 'mutpat/' output directory + touch mutpat/stub_output + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + """ +} diff --git a/modules/local/vcf2maf/main.nf b/modules/local/vcf2maf/main.nf new file mode 100644 index 00000000..707f7510 --- /dev/null +++ b/modules/local/vcf2maf/main.nf @@ -0,0 +1,49 @@ +process VCF2MAF { + tag "${meta.id}" + label 'process_medium' + + container 'quay.io/biocontainers/vcf2maf:1.6.22--hdfd78af_0' + + input: + tuple val(meta), path(vcf) + path genome_fasta + + + output: + tuple val(meta), path("*.maf"), emit: maf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + """ + gunzip -c ${vcf} > ${meta.id}-temp.vcf + + vcf2maf.pl \\ + --inhibit-vep \\ + --input-vcf ${meta.id}-temp.vcf \\ + --output-maf ${meta.id}.maf \\ + --ref-fasta ${genome_fasta} \\ + --tumor-id ${meta.tumor_id} \\ + --normal-id ${meta.normal_id} \\ + --ncbi-build "GRCh38" \\ + ${args} + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vcf2maf: \$(vcf2maf.pl --help | grep -o 'vcf2maf [0-9.]*' | sed 's/vcf2maf //' || echo "1.6.22") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.maf + + echo -e '${task.process}:\\n stub: noversions\\n' > versions.yml + """ +} diff --git a/nextflow.config b/nextflow.config index 00fc8aaf..479336a7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -25,6 +25,7 @@ params { validate_params = true show_hidden_params = false schema_ignore_params = 'hmfdata_paths,umccrdata_paths,miscdata_paths,data_versions,genome' + pcgr_variant_chunk_size = null // Max resource options // Defaults only, expecting to be overwritten @@ -143,6 +144,7 @@ env { R_PROFILE_USER = "/.Rprofile" R_ENVIRON_USER = "/.Renviron" JULIA_DEPOT_PATH = "/usr/local/share/julia" + XDG_CACHE_HOME =
"/tmp/quarto_cache_home" } // Capture exit codes from upstream processes when piping @@ -174,7 +176,7 @@ manifest { mainScript = 'main.nf' nextflowVersion = '!>=22.10.6' version = '0.6.3' - doi = '' + doi = 'https://doi.org/10.5281/zenodo.15833493' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index 47c839df..3d34050c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,10 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "outdir" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", @@ -122,14 +119,7 @@ "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -179,6 +169,12 @@ "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "pcgr_variant_chunk_size": { + "type": "integer", + "description": "Override maximum variants per PCGR chunk for the BOLT somatic annotate step.", + "help_text": "Leave unset to use the default chunk size defined by BOLT. 
Provide an integer to adjust PCGR chunking behaviour when needed.", + "fa_icon": "fas fa-database" } } } diff --git a/pipeline_template.yml b/pipeline_template.yml index c8064235..9e816eb2 100644 --- a/pipeline_template.yml +++ b/pipeline_template.yml @@ -1,7 +1,7 @@ prefix: umccr skip: -- github -- ci -- github_badges -- igenomes -- nf_core_configs + - github + - ci + - github_badges + - igenomes + - nf_core_configs diff --git a/subworkflows/local/prepare_input.nf b/subworkflows/local/prepare_input.nf index 244ed699..ac3993c2 100644 --- a/subworkflows/local/prepare_input.nf +++ b/subworkflows/local/prepare_input.nf @@ -124,6 +124,13 @@ workflow PREPARE_INPUT { return [meta, virusbreakend_dir] } + // CHORD: homologous recombination deficiency prediction + // channel: [ meta, chord_prediction_tsv ] + ch_chord = ch_metas.map { meta -> + def base = file(meta.oncoanalyser_dir).toUriString() + return [meta, "${base}/chord/${meta.tumor_id}.chord.prediction.tsv"] + } + // HRD: homologous recombination deficiency scores // channel: [ meta, hrdscore_csv ] ch_input_hrd = ch_metas.map { meta -> @@ -173,6 +180,7 @@ workflow PREPARE_INPUT { cobalt = ch_cobalt // channel: [ meta, cobalt_dir ] sage_somatic = ch_sage_somatic // channel: [ meta, sage_somatic_vcf, sage_somatic_tbi ] virusbreakend = ch_virusbreakend // channel: [ meta, virusbreakend_dir ] + chord = ch_chord // channel: [ meta, chord_prediction_tsv ] call_inputs = ch_call_inputs // channel: [ meta_esvee, esvee_ref_depth_vcf, esvee_prep_dir ] // DRAGEN channels diff --git a/subworkflows/local/prepare_reference.nf b/subworkflows/local/prepare_reference.nf index 7ef91a5a..26115d7d 100644 --- a/subworkflows/local/prepare_reference.nf +++ b/subworkflows/local/prepare_reference.nf @@ -2,6 +2,8 @@ // Prepare reference data as required // +include { CUSTOM_EXTRACTTARBALL as DECOMP_MISC_DATA } from '../../modules/local/custom/extract_tarball/main' + workflow PREPARE_REFERENCE { take: @@ -18,6 +20,29 @@ workflow 
PREPARE_REFERENCE {
     ch_umccr_data = createDataMap(params.umccrdata_paths, umccr_reference_data_path)
     ch_misc_data = createDataMap(params.miscdata_paths, params.ref_data_path)
 
+    //
+    // Extract tarball resources (e.g. PCGR data, VEP cache) when provided as .tar.gz/.tgz
+    //
+    misc_tarball_inputs = getTarballInputs(params.miscdata_paths, params.ref_data_path)
+    if (misc_tarball_inputs) {
+        ch_misc_data_inputs = Channel.fromList(misc_tarball_inputs)
+
+        DECOMP_MISC_DATA(ch_misc_data_inputs)
+
+        ch_misc_data_extracted = DECOMP_MISC_DATA.out.extracted_dir
+            .collect()
+            .map { dir_list ->
+                // Convert list of directories to a map of [name: dir]
+                def extracted_map = dir_list.collectEntries { dir ->
+                    [(dir.getFileName().toString()): dir]
+                }
+                // Merge extracted data with existing misc_data map
+                return createDataMap(params.miscdata_paths, params.ref_data_path) + extracted_map
+            }
+
+        ch_misc_data = ch_misc_data_extracted
+    }
+
     //
     // Prepare genome paths and info
     //
@@ -45,6 +70,34 @@ def createDataMap(entries, ref_data_base_path) {
     }
 }
 
+// Select the entries whose relative path ends in .tar.gz/.tgz and pair each tarball (resolved
+// against `ref_data_base_path` via joinPath) with a meta map holding its extraction options.
+// Returns a list of [meta, tarball]; meta carries `id`, `strip_components` and, for vep_dir, `subdir`.
+def getTarballInputs(entries, ref_data_base_path) {
+    // Extraction options keyed by entry name
+    def extract_opts = [
+        // VEP cache: strip wrapper dir, extract into homo_sapiens subdir
+        // Result: vep_dir/homo_sapiens/113_GRCh38/
+        vep_dir: [strip_components: 1, subdir: 'homo_sapiens'],
+        // PCGR bundle: don't strip, tarball contains data/ directory
+        // Result: pcgr_dir/data/grch38/
+        pcgr_dir: [strip_components: 0],
+    ]
+    // Default: strip top-level wrapper directory
+    def default_opts = [strip_components: 1]
+
+    // Keep only entries with a non-empty path that names a tarball
+    def tarball_entries = entries.findAll { _name, relpath ->
+        def rel = relpath ? relpath.toString() : ''
+        rel.endsWith('.tar.gz') || rel.endsWith('.tgz')
+    }
+
+    return tarball_entries.collect { name, relpath ->
+        def meta = [id: name] + (extract_opts[name] ?: default_opts)
+        [meta, joinPath(ref_data_base_path, relpath)]
+    }
+}
+
 def joinPath(a, b) {
     def a_noslash = file(a).toUriString().replaceAll('/$', '')
     return
file("${a_noslash}/${b}", checkIfExists: true) diff --git a/workflows/sash.nf b/workflows/sash.nf index eb1bbb72..bad9c60d 100644 --- a/workflows/sash.nf +++ b/workflows/sash.nf @@ -42,13 +42,17 @@ include { BOLT_SMLV_GERMLINE_PREPARE } from '../modules/local/bolt/smlv_germline include { BOLT_SMLV_GERMLINE_REPORT } from '../modules/local/bolt/smlv_germline/report/main' include { BOLT_SMLV_SOMATIC_ANNOTATE } from '../modules/local/bolt/smlv_somatic/annotate/main' include { BOLT_SMLV_SOMATIC_FILTER } from '../modules/local/bolt/smlv_somatic/filter/main' -include { BOLT_SMLV_SOMATIC_REPORT } from '../modules/local/bolt/smlv_somatic/report/main' include { BOLT_SMLV_SOMATIC_RESCUE } from '../modules/local/bolt/smlv_somatic/rescue/main' +include { BOLT_SMLV_SOMATIC_REPORT } from '../modules/local/bolt/smlv_somatic/report/main' include { BOLT_SV_SOMATIC_ANNOTATE } from '../modules/local/bolt/sv_somatic/annotate/main' include { BOLT_SV_SOMATIC_PRIORITISE } from '../modules/local/bolt/sv_somatic/prioritise/main' -include { ESVEE_CALL } from '../modules/local/esvee/call/main' include { PAVE_SOMATIC } from '../modules/local/pave/somatic/main' +include { SIGRAP_HRDETECT } from '../modules/local/sigrap/hrdetect/main' +include { SIGRAP_MUTPAT } from '../modules/local/sigrap/mutpat/main' +include { VCF2MAF } from '../modules/local/vcf2maf/main' + +include { ESVEE_CALL } from '../modules/local/esvee/call/main' include { LINX_ANNOTATION } from '../subworkflows/local/linx_annotation' include { LINX_PLOTTING } from '../subworkflows/local/linx_plotting' include { PREPARE_INPUT } from '../subworkflows/local/prepare_input' @@ -74,13 +78,6 @@ workflow SASH { // channel: [ versions.yml ] ch_versions = Channel.empty() - - - - // - // Prepare inputs from samplesheet - // - PREPARE_INPUT( file(params.input), ) @@ -93,6 +90,7 @@ workflow SASH { ch_sage_somatic = PREPARE_INPUT.out.sage_somatic // channel: [ meta, sage_somatic_vcf, sage_somatic_tbi ] ch_virusbreakend = 
PREPARE_INPUT.out.virusbreakend // channel: [ meta, virusbreakend_dir ] ch_call_inputs = PREPARE_INPUT.out.call_inputs // channel: [ meta_esvee, esvee_ref_depth_vcf, esvee_prep_dir ] + ch_chord = PREPARE_INPUT.out.chord // channel: [ meta, chord_prediction_tsv ] // DRAGEN inputs ch_input_hrd = PREPARE_INPUT.out.hrd // channel: [ meta, hrdscore_csv ] @@ -120,8 +118,7 @@ workflow SASH { // Somatic small variants // - - + // Prepare rescue inputs with meta transformation // channel: [ meta_bolt, dragen_somatic_vcf, dragen_somatic_tbi, sage_somatic_vcf, sage_somatic_tbi ] ch_smlv_somatic_rescue_inputs = WorkflowSash.groupByMeta( ch_input_vcf_somatic, @@ -152,6 +149,7 @@ workflow SASH { umccr_data.annotations_dir, misc_data.pon_dir, misc_data.pcgr_dir, + misc_data.vep_dir ) ch_versions = ch_versions.mix(BOLT_SMLV_SOMATIC_ANNOTATE.out.versions) @@ -162,6 +160,7 @@ workflow SASH { ch_versions = ch_versions.mix(BOLT_SMLV_SOMATIC_FILTER.out.versions) + // Restore meta and create clean outputs // channel: [ meta, smlv_somatic_vcf ] ch_smlv_somatic_out = WorkflowSash.restoreMeta(BOLT_SMLV_SOMATIC_FILTER.out.vcf, ch_inputs) .map { meta, vcf, tbi -> [meta, vcf] } @@ -181,12 +180,42 @@ workflow SASH { hmf_data.gnomad_resource, ) + ch_versions = ch_versions.mix(PAVE_SOMATIC.out.versions) + // channel: [ meta, pave_somatic_vcf ] ch_pave_somatic_out = WorkflowSash.restoreMeta(PAVE_SOMATIC.out.vcf, ch_inputs) + // + // Convert somatic VCF to MAF format + // + + // channel: [ meta_vcf2maf, smlv_somatic_vcf ] + ch_vcf2maf_inputs = ch_smlv_somatic_out.map { meta, vcf -> + def meta_vcf2maf = [ + key: meta.id, + id: meta.id, + tumor_id: meta.tumor_id, + normal_id: meta.normal_id, + ] + return [meta_vcf2maf, vcf] + } + + VCF2MAF( + ch_vcf2maf_inputs, + genome.fasta + ) + + ch_versions = ch_versions.mix(VCF2MAF.out.versions) + + // channel: [ meta, somatic_maf ] + ch_vcf2maf_out = VCF2MAF.out.maf + + + + // // Germline small variants // @@ -219,7 +248,6 @@ workflow SASH { // // Somatic 
structural variants - // ESVEE_CALL( @@ -304,6 +332,7 @@ workflow SASH { BOLT_SMLV_SOMATIC_REPORT( ch_smlv_somatic_report_inputs, misc_data.pcgr_dir, + misc_data.vep_dir, umccr_data.somatic_panel_regions_cds, hmf_data.sage_highconf_regions, genome.fasta, @@ -330,6 +359,7 @@ workflow SASH { ch_smlv_germline_report_inputs, umccr_data.germline_panel_genes, misc_data.pcgr_dir, + misc_data.vep_dir ) ch_versions = ch_versions.mix(BOLT_SMLV_GERMLINE_REPORT.out.versions) @@ -430,11 +460,60 @@ workflow SASH { + // + // Sigrap + // + + // channel: [ meta_sigrap, smlv_somatic_vcf, sv_somatic_vcf, cnv_somatic_tsv ] + ch_sigrap_hrdetect_inputs = WorkflowSash.groupByMeta( + ch_smlv_somatic_out, + ch_sv_somatic_sv_vcf_out, + ch_sv_somatic_cnv_tsv_out, + ) + .map { meta, smlv_vcf, sv_vcf, cnv_tsv -> + def meta_sigrap = [ + key: meta.id, + id: meta.id, + tumor_id: meta.tumor_id, + ] + return [meta_sigrap, smlv_vcf, sv_vcf, cnv_tsv] + } + + SIGRAP_HRDETECT( + ch_sigrap_hrdetect_inputs + ) + + // channel: [ meta, hrdetect_json ] + ch_sigrap_hrdetect = WorkflowSash.restoreMeta(SIGRAP_HRDETECT.out.hrdetect_json, ch_inputs) + ch_versions = ch_versions.mix(SIGRAP_HRDETECT.out.versions) + + // channel: [ meta_sigrap, smlv_somatic_vcf ] + ch_sigrap_mutpat_inputs = ch_smlv_somatic_out.map { meta, vcf -> + def meta_sigrap = [ + key: meta.id, + id: meta.id, + tumor_id: meta.tumor_id, + ] + return [meta_sigrap, vcf] + } + + SIGRAP_MUTPAT( + ch_sigrap_mutpat_inputs + ) + + // channel: [ meta, mutpat_output ] + ch_sigrap_mutpat = WorkflowSash.restoreMeta(SIGRAP_MUTPAT.out.mutpat_output, ch_inputs) + ch_versions = ch_versions.mix(SIGRAP_MUTPAT.out.versions) + + + + + // // Generate the cancer report // - // channel: [ meta_bolt, smlv_somatic_vcf, smlv_somatic_bcftools_stats, smlv_somatic_counts_process, sv_tsv, sv_vcf, cnv_tsv, af_global, af_keygenes, purple_baf_circos_plot, purple_dir, virusbreakend_dir, dragen_hrd ] + // channel: [ meta_bolt, smlv_somatic_vcf, smlv_somatic_bcftools_stats, 
smlv_somatic_counts_process, sv_tsv, sv_vcf, cnv_tsv, af_global, af_keygenes, purple_baf_circos_plot, purple_dir, virusbreakend_dir, dragen_hrd, mutpat, hrdetect, chord ] ch_cancer_report_inputs = WorkflowSash.groupByMeta( ch_smlv_somatic_out, ch_smlv_somatic_report_stats_out, @@ -448,6 +527,9 @@ workflow SASH { PURPLE_CALLING.out.purple_dir, ch_virusbreakend, ch_input_hrd, + ch_sigrap_mutpat, + ch_sigrap_hrdetect, + ch_chord, flatten_mode: 'nonrecursive', ) .map { @@ -478,11 +560,11 @@ workflow SASH { // channel: [ meta, somatic_dragen_dir ] ch_input_dragen_somatic_dir = ch_inputs - .map { meta -> [meta, meta.dragen_somatic_dir] } + .map { meta -> [meta, file(meta.dragen_somatic_dir)] } // channel: [ meta, germline_dragen_dir ] ch_input_dragen_germline_dir = ch_inputs - .map { meta -> [meta, meta.dragen_germline_dir] } + .map { meta -> [meta, file(meta.dragen_germline_dir)] } // channel: [ meta_multiqc, [somatic_dragen_dir, germline_dragen_dir, somatic_bcftools_stats, germline_bcftools_stats, somatic_counts_type, germline_counts_type, purple_dir] ] ch_multiqc_report_inputs = WorkflowSash.groupByMeta( @@ -518,7 +600,7 @@ workflow SASH { // - // Annotate post processed strucutral variant events + // Annotate post processed structural variant events // LINX_ANNOTATION( @@ -542,7 +624,6 @@ workflow SASH { - // // TASK: Aggregate software versions //