diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index ed577c8e..b4ca4528 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -45,9 +45,6 @@ jobs: - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 - # TODO nf-core: You can customise AWS full pipeline tests as required - # Add full size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e2213e03..7d44b9fa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -44,6 +44,11 @@ jobs: - isMaster: false profile: "singularity" steps: + - name: Free some space + run: | + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Check out pipeline code uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 with: @@ -85,3 +90,45 @@ jobs: - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}" run: | nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},${{ matrix.profile }} --outdir ./results + + profiles: + name: Run workflow profile + # Only run on push if this is the nf-core dev branch (merged PRs) + if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/mag') }} + runs-on: ubuntu-latest + strategy: + matrix: + # Run remaining test profiles with minimum nextflow version + test_name: + [ + test_host_rm, + test_hybrid, + test_hybrid_host_rm, + test_busco_auto, + test_ancient_dna, + test_adapterremoval, + test_binrefinement, + test_virus_identification, + test_single_end, + test_concoct, + ] + steps: + - name: Free some space + run: | + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Clean up Disk space + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + + - name: Run pipeline with ${{ matrix.test_name }} test profile + run: | + nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},docker --outdir ./results diff --git a/.nf-core.yml b/.nf-core.yml index 5ae8e4aa..21c50125 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -8,8 +8,8 @@ lint: nf_core_version: 3.1.2 repository_type: pipeline template: - author: "Hadrien Gourl\xE9, Daniel Straub, Sabrina Krakau, James A. Fellows Yates,\ - \ Maxime Borry" + author: "Hadrien Gourlé, Daniel Straub, Sabrina Krakau, James A. Fellows Yates, + Maxime Borry" description: Assembly, binning and annotation of metagenomes force: false is_nfcore: true diff --git a/CHANGELOG.md b/CHANGELOG.md index 71be1fcf..9a37df10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,638 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v3.3.1dev - [date] +## v3.3.1dev - [unreleased] -Initial release of nf-core/mag, created with the [nf-core](https://nf-co.re/) template. 
+### `Added` + +### `Changed` + +### `Fixed` + +### `Dependencies` + +### `Deprecated` + +## 3.3.0 [2024-12-19] ### `Added` +- [#692](https://github.com/nf-core/mag/pull/692) - Added Nanoq as optional long-read filtering tool (added by @muabnezor) +- [#692](https://github.com/nf-core/mag/pull/692) - Added chopper as optional long-read filtering tool and/or phage lambda removal tool (added by @muabnezor) +- [#707](https://github.com/nf-core/mag/pull/707) - Make Bin QC a subworkflow (added by @dialvarezs) +- [#707](https://github.com/nf-core/mag/pull/707) - Added CheckM2 as an alternative bin completeness and QC tool (added by @dialvarezs) +- [#708](https://github.com/nf-core/mag/pull/708) - Added `--exclude_unbins_from_postbinning` parameter to exclude unbinned contigs from post-binning processes, speeding up Prokka in some cases (added by @dialvarezs) +- [#732](https://github.com/nf-core/mag/pull/732) - Added support for Prokka's compliance mode with `--prokka_with_compliance --prokka_compliance_centre` (reported by @audy and @Thomieh73, added by @jfy133) + +### `Changed` + +- [#731](https://github.com/nf-core/mag/pull/731) - Updated to nf-core 3.1.0 `TEMPLATE` (by @jfy133) + ### `Fixed` +- [#708](https://github.com/nf-core/mag/pull/708) - Fixed channel passed as GUNC input (added by @dialvarezs) +- [#724](https://github.com/nf-core/mag/pull/724) - Fix quoting in `utils_nfcore_mag_pipeline/main.nf` (added by @dialvarezs) +- [#716](https://github.com/nf-core/mag/pull/692) - Make short read processing a subworkflow (added by @muabnezor) +- [#729](https://github.com/nf-core/mag/pull/729) - Fixed misspecified multi-FASTQ input for single-end data in MEGAHIT (reported by John Richards, fix by @jfy133) + ### `Dependencies` +| Tool | Previous version | New version | +| ------- | ---------------- | ----------- | +| CheckM | 1.2.1 | 1.2.3 | +| CheckM2 | | 1.0.2 | +| chopper | | 0.9.0 | +| GUNC | 1.0.5 | 1.0.6 | +| nanoq | | 0.10.0 | + ### `Deprecated` + +## 3.2.1 [2024-10-30] + +### `Added` + +### `Changed` + +### `Fixed` + +- [#707](https://github.com/nf-core/mag/pull/674) - Fix missing space resulting in malformed args for MEGAHIT (reported by @d4straub, fix by @jfy133) + +### `Dependencies` + +### `Deprecated` + +## 3.2.0 [2024-10-27] + +### `Added` + +- [#674](https://github.com/nf-core/mag/pull/674) - Added `--longread_adaptertrimming_tool`, where the user can choose between porechop_abi (default) and porechop (added by @muabnezor) + +### `Changed` + +- [#674](https://github.com/nf-core/mag/pull/674) - Changed to porechop-abi as default adapter trimming tool for long reads.
User can still use porechop if preferred (added by @muabnezor) +- [#666](https://github.com/nf-core/mag/pull/666) - Update SPAdes to version 4.0.0, replace both METASPADES and MEGAHIT with official nf-core modules (requested by @elsherbini, fix by @jfy133) +- [#666](https://github.com/nf-core/mag/pull/666) - Update URLs to GTDB database downloads due to server move (reported by @Jokendo-collab, fix by @jfy133) +- [#695](https://github.com/nf-core/mag/pull/695) - Updated to nf-core 3.0.2 `TEMPLATE` (by @jfy133) +- [#695](https://github.com/nf-core/mag/pull/695) - Switch more stable Zenodo link for CheckM data (by @jfy133) + +### `Fixed` + +- [#674](https://github.com/nf-core/mag/pull/674) - Make longread preprocessing a subworkflow (added by @muabnezor) +- [#674](https://github.com/nf-core/mag/pull/674) - Add porechop and filtlong logs to multiqc (added by @muabnezor) +- [#674](https://github.com/nf-core/mag/pull/674) - Change local filtlong module to the official nf-core/filtlong module (added by @muabnezor) +- [#690](https://github.com/nf-core/mag/pull/690) - MaxBin2 now using the abundance information from different samples rather than an average (reported by @uel3 and fixed by @d4straub) +- [#698](https://github.com/nf-core/mag/pull/698) - Updated prodigal module to not pick up input symlinks for compression causing pigz errors (reported by @zackhenny, fix by @jfy133 ) + +### `Dependencies` + +| Tool | Previous version | New version | +| ------------ | ---------------- | ----------- | +| Porechop_ABI | | 0.5.0 | +| Filtlong | 0.2.0 | 0.2.1 | +| SPAdes | 3.15.3 | 4.0.0 | + +### `Deprecated` + +## 3.1.0 [2024-10-04] + +### `Added` + +- [#665](https://github.com/nf-core/mag/pull/648) - Add support for supplying pre-made bowtie host reference index (requested by @simone-pignotti, added by @jfy133) +- [#670](https://github.com/nf-core/mag/pull/670) - Added `--gtdbtk_pplacer_useram` to run GTDBTk in memory mode rather than write to disk (requested by @harper357, fixed by @jfy133) + +### `Changed` + +- [#664](https://github.com/nf-core/mag/pull/664) - Update GTDBTk to latest version, with updated column names, update GTDB to release 220 (by @dialvarezs) +- [#676](https://github.com/nf-core/mag/pull/676) - Added exit code 12 to valid SPAdes retry codes, due to OOM errors from spades-hammer (reported by @bawee, fix by @jfy133) + +### `Fixed` + +- [#667](https://github.com/nf-core/mag/pull/667) - Fix pipeline crashing if only CONCOCT selected during binning (reported and fixed by @jfy133) +- [#670](https://github.com/nf-core/mag/pull/670) - Re-add missing GTDBTk parameters into GTDBTk module (reported by harper357, fixed by @jfy133) +- [#672](https://github.com/nf-core/mag/pull/673) - Fix GTDB-Tk per-sample TSV files not being published in output directory (reported by @jhayer, fix by @jfy133) + +### `Dependencies` + +| Tool | Previous version | New version | +| ------ | ---------------- | ----------- | +| GTDBTk | 2.3.2 | 2.4.0 | + +### `Deprecated` + +- [#670](https://github.com/nf-core/mag/pull/670) - Deprecated `--gtdbtk_pplacer_scratch` due to unintuitive usage (reported by harper357, fixed by @jfy133) + +## 3.0.3 [2024-08-27] + +### `Added` + +### `Changed` + +### `Fixed` + +- [#648](https://github.com/nf-core/mag/pull/648) - Fix sample ID/assembly ID check failure when no IDs match (reported by @zackhenny, fix by @prototaxites) +- [#646](https://github.com/nf-core/mag/pull/646) - GTDB-Tk directory input now creates a value channel so it runs for all entries to the process and not just the 
first (reported by @amizeranschi, fix by @prototaxites). +- [#639](https://github.com/nf-core/mag/pull/639) - Fix pipeline failure when a sample produces only a single bin (fix by @d-callan) +- [#651](https://github.com/nf-core/mag/pull/651) - Replace base container for bash only modules to reduce number of containers in pipeline (reported and fixed by @harper357) +- [#652](https://github.com/nf-core/mag/pull/652) - Fix documentation typo in using user-defined assembly parameters (reported and fixed by @amizeranschi) +- [#653](https://github.com/nf-core/mag/pull/653) - Fix overwriting of per-bin 'raw' GUNC RUN output files (multi-bin summary tables not affected) (reported by @zackhenny and fixed by @jfy133) + +### `Dependencies` + +### `Deprecated` + +## 3.0.2 [2024-07-04] + +### `Added` + +### `Changed` + +- [#633](https://github.com/nf-core/mag/pull/633/) - Changed BUSCO to use offline mode when the database is specified by the user (reported by @ChristophKnapp and many others, fix by @jfy133) +- [#632](https://github.com/nf-core/mag/pull/632) - Use default NanoLyse log of just removed reads rather than custom (by @jfy133) + +### `Fixed` + +- [#630](https://github.com/nf-core/mag/pull/630) - Fix CONCOCT empty bins killing the pipeline, and allow for true multithreading again (removing OPENBLAS loop) (reported by @maxibor, fix by @maxibor and @jfy133) + +### `Dependencies` + +| Tool | Previous version | New version | +| -------- | ---------------- | ----------- | +| Porechop | 0.2.3_seqan2.1.1 | 0.2.4 | +| NanoPlot | 1.26.3 | 1.41.6 | +| NanoLyse | 1.1.0 | 1.2.0 | + +### `Deprecated` + +## 3.0.1 [2024-06-10] + +### `Added` + +### `Changed` + +- [#625](https://github.com/nf-core/mag/pull/625) - Updated link to geNomad database for downloading (reported by @amizeranschi, fix by @jfy133) + +### `Fixed` + +- [#618](https://github.com/nf-core/mag/pull/618) - Fix CENTRIFUGE mkfifo failures by using work directory /tmp (reported by @skrakau, fix by @jfy133) + +### `Dependencies` + +| Tool | Previous version | New version | +| ---------- | ---------------- | ----------- | +| Centrifuge | 1.0.4_beta | 1.0.4.1 | + +### `Deprecated` + +## 3.0.0 - [2024-05-13] + +### `Added` + +- [#615](https://github.com/nf-core/mag/pull/615) - Add new logo (by @jfy133) + +### `Changed` + +- [#599](https://github.com/nf-core/mag/pull/599) - Update to nf-core v2.13.1 `TEMPLATE` (by @jfy133) +- [#614](https://github.com/nf-core/mag/pull/614) - Update to nf-core v2.14.1 `TEMPLATE` (by @jfy133) + +### `Fixed` + +- [#606](https://github.com/nf-core/mag/pull/606) - Prevent pipeline crash when premade mashdb given to or no alignments found with GTDB-TK_CLASSIFYWF (reported by @cedwardson4, fix by @jfy133) + +### `Dependencies` + +### `Deprecated` + +- [#599](https://github.com/nf-core/mag/pull/599) - Direct reads input (`--input 'sample_{R1,R2}.fastq.gz'`) is no longer supported, all input must come via samplesheets (by @jfy133) + +## 2.5.4 - [2024-02-12] + +### `Added` + +### `Changed` + +- [#581](https://github.com/nf-core/mag/pull/581) - Added explicit licence text to headers of all custom scripts (reported by @FriederikeHanssen and @maxibor, fix by @jfy133) +- [#602](https://github.com/nf-core/mag/pull/602) - Co-binning when using aDNA mode now enabled (added by @maxibor) + +### `Fixed` + +- [#583](https://github.com/nf-core/mag/pull/583) - Fix GTDB database input when directory supplied (fix by @jfy133) + +### `Dependencies` + +### `Deprecated` + +## 2.5.3 - [2024-02-05] + +### `Added` + +### `Changed` + +- 
[#575](https://github.com/nf-core/mag/pull/575) - Deactivated MetaSPAdes, Centrifuge, and GTDB in test_full profile due to some container incompatibilities in nf-core megatest AWS configurations (by @jfy133) + +### `Fixed` + +- [#574](https://github.com/nf-core/mag/pull/574) - Fix wrong channel going to BIN_SUMMARY (fix by @maxibor) + +### `Dependencies` + +### `Deprecated` + +## 2.5.2 - [2024-02-02] + +### `Added` + +- [#562](https://github.com/nf-core/mag/pull/562) - Add CAT summary into the global bin_summary (by @maxibor) +- [#565](https://github.com/nf-core/mag/pull/565) - Add warning of empty GTDB-TK results if no contigs pass completeness filter (by @jfy133 and @maxibor) + +### `Changed` + +- [#563](https://github.com/nf-core/mag/pull/562) - Update to nf-core v2.12 `TEMPLATE` (by @CarsonJM) +- [#566](https://github.com/nf-core/mag/pull/566) - More logical ordering of MultiQC sections (assembly and bin sections go together respectively) (fix by @jfy133) + +### `Fixed` + +- [#548](https://github.com/nf-core/mag/pull/548) - Fixes to (reported by @maxibor, @PPpissar, @muniheart, @llborcard, fix by @maxibor) + - GTDBK-TK execution + - CAT/QUAST/DEPTH bin summary file name collisions + - BUSCO database parsing + - Correct CAT name files +- [#558](https://github.com/nf-core/mag/pull/558) - Fix bug in run merging when dealing with single end data (reported by @roberta-davidson, fix by @jfy133) + +### `Dependencies` + +### `Deprecated` + +## 2.5.1 - [2023-11-17] + +### `Added` + +### `Changed` + +### `Fixed` + +- [#489](https://github.com/nf-core/mag/pull/489) - Fix file name collision clashes for CHECKM, CAT, GTDBTK, and QUAST (reported by @tillenglert and @maxibor, fix by @maxibor) +- [#533](https://github.com/nf-core/mag/pull/533) - Fix glob pattern for publishing MetaBAT2 bins in results (reported by @patriciatran, fix by @jfy133) +- [#535](https://github.com/nf-core/mag/pull/535) - Fix input validation pattern to again allow direct FASTQ input (reported by @lennijusten, @emnilsson, fix by @jfy133, @d4straub, @mahesh-panchal, @nvnieuwk) + +### `Dependencies` + +| Tool | Previous version | New version | +| ---- | ---------------- | ----------- | +| CAT | 4.6 | 5.2.3 | + +### `Deprecated` + +- [#536](https://github.com/nf-core/mag/pull/536) - Remove custom function with native Nextflow for checking file extension (reported by @d4straub, fix by @jfy133) + +## 2.5.0 - [2023-10-10] + +### `Added` + +- [#504](https://github.com/nf-core/mag/pull/504) - New parameters `--busco_db`, `--kraken2_db`, and `--centrifuge_db` now support directory input of a pre-uncompressed database archive directory (by @gregorysprenger). + +### `Changed` + +- [#511](https://github.com/nf-core/mag/pull/511) - Update to nf-core 2.10 `TEMPLATE` (by @jfy133) +- [#504](https://github.com/nf-core/mag/pull/504) - `--save_busco_reference` is now replaced by `--save_busco_db` (by @gregorysprenger). 
+ +### `Fixed` + +- [#514](https://github.com/nf-core/mag/pull/514) - Fix missing CONCOCT files in downstream output (reported by @maxibor, fix by @jfy133) +- [#515](https://github.com/nf-core/mag/pull/515) - Fix overwriting of GUNC output directories when running with domain classification (reported by @maxibor, fix by @jfy133) +- [#516](https://github.com/nf-core/mag/pull/516) - Fix edge-case bug where MEGAHIT re-uses previous work directory on resume and fails (reported by @husensofteng, fix by @prototaxites) +- [#520](https://github.com/nf-core/mag/pull/520) - Fix missing Tiara output files (fix by @jfy133) +- [#522](https://github.com/nf-core/mag/pull/522) - Fix 'nulls' in depth plot PNG files (fix by @jfy133) + +### `Dependencies` + +### `Deprecated` + +- [#504](https://github.com/nf-core/mag/pull/504) - `--busco_reference`, `--busco_download_path`, `--save_busco_reference` parameters have been deprecated and replaced with new parameters (by @gregorysprenger). + +## 2.4.0 - 2023-09-26 + +### `Added` + +- [#497](https://github.com/nf-core/mag/pull/497) - Adds support for pointing at a local db for krona, using the parameter `--krona_db` (by @willros). +- [#395](https://github.com/nf-core/mag/pull/395) - Adds support for fast domain-level classification of bins using Tiara, to allow bins to be separated into eukaryotic and prokaryotic-specific processes. +- [#422](https://github.com/nf-core/mag/pull/422) - Adds support for normalization of read depth with BBNorm (added by @erikrikarddaniel and @fabianegli) +- [#439](https://github.com/nf-core/mag/pull/439) - Adds ability to enter the pipeline at the binning stage by providing a CSV of pre-computed assemblies (by @prototaxites) +- [#459](https://github.com/nf-core/mag/pull/459) - Adds ability to skip damage correction step in the ancient DNA workflow and just run pyDamage (by @jfy133) +- [#364](https://github.com/nf-core/mag/pull/364) - Adds geNomad nf-core modules for identifying viruses in assemblies (by @PhilPalmer and @CarsonJM) +- [#481](https://github.com/nf-core/mag/pull/481) - Adds MetaEuk for annotation of eukaryotic MAGs, and MMSeqs2 to enable downloading databases for MetaEuk (by @prototaxites) +- [#437](https://github.com/nf-core/mag/pull/429) - `--gtdb_db` also now supports directory input of an pre-uncompressed GTDB archive directory (reported by @alneberg, fix by @jfy133) +- [#494](https://github.com/nf-core/mag/pull/494) - Adds support for saving the BAM files from Bowtie2 mapping of input reads back to assembly (fix by @jfy133) + +### `Changed` + +- [#428](https://github.com/nf-core/mag/pull/428) [#467](https://github.com/nf-core/mag/pull/467) - Update to nf-core 2.8, 2.9 `TEMPLATE` (by @jfy133) +- [#429](https://github.com/nf-core/mag/pull/429) - Replaced hardcoded CheckM database auto-download URL to a parameter (reported by @erikrikarddaniel, fix by @jfy133) +- [#441](https://github.com/nf-core/mag/pull/441) - Deactivated CONCOCT in AWS 'full test' due to very long runtime (fix by @jfy133). +- [#442](https://github.com/nf-core/mag/pull/442) - Remove warning when BUSCO finds no genes in bins, as this can be expected in some datasets (reported by @Lumimar, fix by @jfy133). 
+- [#444](https://github.com/nf-core/mag/pull/444) - Moved BUSCO bash code to script (by @jfy133) +- [#477](https://github.com/nf-core/mag/pull/477) - `--gtdb` parameter is split into `--skip_gtdbtk` and `--gtdb_db` to allow finer control over GTDB database retrieval (fix by @jfy133) +- [#500](https://github.com/nf-core/mag/pull/500) - Temporarily disabled downstream processing of both refined and raw bins due to bug (by @jfy133) + +### `Fixed` + +- [#496](https://github.com/nf-core/mag/pull/496) - Fix help text for paramters `--bowtie2_mode`, `spades_options` and `megahit_options` (by @willros) +- [#400](https://github.com/nf-core/mag/pull/400) - Fix duplicated Zenodo badge in README (by @jfy133) +- [#406](https://github.com/nf-core/mag/pull/406) - Fix CheckM database always downloading, regardless if CheckM is selected (by @jfy133) +- [#419](https://github.com/nf-core/mag/pull/419) - Fix bug with busco_clean parameter, where it is always activated (by @prototaxites) +- [#426](https://github.com/nf-core/mag/pull/426) - Fixed typo in help text for parameters `--host_genome` and `--host_fasta` (by @tillenglert) +- [#434](https://github.com/nf-core/mag/pull/434) - Fix location of samplesheet for AWS full tests (reported by @Lfulcrum, fix by @jfy133) +- [#438](https://github.com/nf-core/mag/pull/438) - Fixed version inconsistency between conda and containers for GTDBTK_CLASSIFYWF (by @jfy133) +- [#439](https://github.com/nf-core/mag/pull/445) - Fix bug in assembly input (by @prototaxites) +- [#447](https://github.com/nf-core/mag/pull/447) - Remove `default: None` from parameter schema (by @drpatelh) +- [#449](https://github.com/nf-core/mag/pull/447) - Fix results file overwriting in Ancient DNA workflow (reported by @alexhbnr, fix by @jfy133) +- [#470](https://github.com/nf-core/mag/pull/470) - Fix binning preparation from running even when binning was requested to be skipped (reported by @prototaxites, fix by @jfy133) +- [#480](https://github.com/nf-core/mag/pull/480) - Improved `-resume` reliability through better meta map preservation (reported by @prototaxites, fix by @jfy133) +- [#493](https://github.com/nf-core/mag/pull/493) - Update `METABAT2` nf-core module so that it reduced the number of unnecessary file moves, enabling virtual filesystems (fix by @adamrtalbot) +- [#500](https://github.com/nf-core/mag/pull/500) - Fix MaxBin2 bins not being saved in results directly properly (reported by @Perugolate, fix by @jfy133) + +### `Dependencies` + +| Tool | Previous version | New version | +| -------- | ---------------- | ----------- | +| BCFtools | 1.16 | 1.17 | +| SAMtools | 1.16.1 | 1.17 | +| fastp | 0.23.2 | 0.23.4 | +| MultiQC | 1.14 | 1.15 | + +## v2.3.2 - [2023-06-23] + +### `Fixed` + +- [#461](https://github.com/nf-core/mag/pull/461) - Fix full-size AWS test profile paths (by @jfy133) +- [#461](https://github.com/nf-core/mag/pull/461) - Fix pyDamage results being overwritten (reported by @alexhbnr, fix by @jfy133) + +## v2.3.1 - [2023-06-19] + +### `Fixed` + +- [#458](https://github.com/nf-core/mag/pull/458) - Correct the major issue in ancient DNA workflow of binning refinement being performed on uncorrected contigs instead of aDNA consensus recalled contigs (issue [#449](https://github.com/nf-core/mag/issues/449)) +- [#451](https://github.com/nf-core/mag/pull/451) - Fix results file overwriting in Ancient DNA workflow (reported by @alexhbnr, fix by @jfy133, and integrated by @maxibor in [#458](https://github.com/nf-core/mag/pull/458) ) + +## v2.3.0 - [2023/03/02] + +### `Added` + 
+- [#350](https://github.com/nf-core/mag/pull/350) - Adds support for CheckM as alternative bin completeness and QC tool (added by @jfy133 and @skrakau) +- [#353](https://github.com/nf-core/mag/pull/353) - Added the busco_clean parameter to optionally clean each BUSCO directory after a successful (by @prototaxites) +- [#361](https://github.com/nf-core/mag/pull/361) - Added the skip_clipping parameter to skip read preprocessing with fastp or adapterremoval. Running the pipeline with skip_clipping, keep_phix and without specifying a host genome or fasta file skips the FASTQC_TRIMMED process (by @prototaxites) +- [#365](https://github.com/nf-core/mag/pull/365) - Added CONCOCT as an additional (optional) binning tool (by @jfy133) +- [#366](https://github.com/nf-core/mag/pull/366) - Added CAT_SUMMARISE process and cat_official_taxonomy parameter (by @prototaxites) +- [#372](https://github.com/nf-core/mag/pull/372) - Allow CAT_DB to take an extracted database as well as a tar.gz file (by @prototaxites). +- [#380](https://github.com/nf-core/mag/pull/380) - Added support for saving processed reads (clipped, host removed etc.) to results directory (by @jfy133) +- [#394](https://github.com/nf-core/mag/pull/394) - Added GUNC for additional chimeric bin/contamination QC (added by @jfy133) + +### `Changed` + +- [#340](https://github.com/nf-core/mag/pull/340),[#368](https://github.com/nf-core/mag/pull/368),[#373](https://github.com/nf-core/mag/pull/373) - Update to nf-core 2.7.2 `TEMPLATE` (by @jfy133, @d4straub, @skrakau) +- [#373](https://github.com/nf-core/mag/pull/373) - Removed parameter `--enable_conda`. Updated local modules to new conda syntax and updated nf-core modules (by @skrakau) +- [#385](https://github.com/nf-core/mag/pull/385) - CAT also now runs on unbinned contigs as well as binned contigs (added by @jfy133) +- [#399](https://github.com/nf-core/mag/pull/399/files) - Removed undocumented BUSCO_PLOT process (previously generated `*.busco_figure.png` plots unsuitable for metagenomics) (by @skrakau). +- [#416](https://github.com/nf-core/mag/pull/416) - Use GTDBTK_CLASSIFYWF nf-core module instead of local module (added by @alxndrdiaz) + +### `Fixed` + +- [#345](https://github.com/nf-core/mag/pull/345) - Bowtie2 mode changed to global alignment for ancient DNA mode (`--very-sensitive` mode) to prevent soft clipping at the end of reads when running in local mode. (by @maxibor) +- [#349](https://github.com/nf-core/mag/pull/349) - Add a warning that pipeline will reset minimum contig size to 1500 specifically MetaBAT2 process, if a user supplies below this threshold. 
(by @jfy133) +- [#352](https://github.com/nf-core/mag/pull/352) - Escape the case in the BUSCO module that BUSCO can just detect a root lineage but is not able to find any marker genes (by @alexhbnr) +- [#355](https://github.com/nf-core/mag/pull/355) - Include error code 21 for retrying with higher memory for SPAdes and hybridSPAdes (by @mglubber) + +### `Dependencies` + +| Tool | Previous version | New version | +| --------- | ---------------- | ----------- | +| BUSCO | 5.1.0 | 5.4.3 | +| BCFtools | 1.14 | 1.16 | +| Freebayes | 1.3.5 | 1.3.6 | +| SAMtools | 1.15 | 1.16.1 | + +## v2.2.1 - 2022/08/25 + +### `Added` + +### `Changed` + +### `Fixed` + +- [#328](https://github.com/nf-core/mag/pull/328) - Fix too many symbolic links issue in local convert_depths module (reported by @ChristophKnapp and fixed by @apeltzer, @jfy133) +- [#329](https://github.com/nf-core/mag/pull/329) - Each sample now gets it's own result directory for PyDamage analysis and filter (reported and fixed by @maxibor) + +### `Dependencies` + +## v2.2.0 - 2022/06/14 + +### `Added` + +- [#263](https://github.com/nf-core/mag/pull/263) - Restructure binning subworkflow in preparation for aDNA workflow and extended binning +- [#247](https://github.com/nf-core/mag/pull/247) - Add ancient DNA subworkflow +- [#263](https://github.com/nf-core/mag/pull/263) - Add MaxBin2 as second contig binning tool +- [#285](https://github.com/nf-core/mag/pull/285) - Add AdapterRemoval2 as an alternative read trimmer +- [#291](https://github.com/nf-core/mag/pull/291) - Add DAS Tool for bin refinement +- [#319](https://github.com/nf-core/mag/pull/319) - Activate pipeline-specific institutional nf-core/configs + +### `Changed` + +- [#269](https://github.com/nf-core/mag/pull/269),[#283](https://github.com/nf-core/mag/pull/283),[#289](https://github.com/nf-core/mag/pull/289),[#302](https://github.com/nf-core/mag/pull/302) - Update to nf-core 2.4 `TEMPLATE` +- [#286](https://github.com/nf-core/mag/pull/286) - Cite our publication instead of the preprint +- [#291](https://github.com/nf-core/mag/pull/291), [#299](https://github.com/nf-core/mag/pull/299) - Add extra results folder `GenomeBinning/depths/contigs` for `[assembler]-[sample/group]-depth.txt.gz`, and `GenomeBinning/depths/bins` for `bin_depths_summary.tsv` and `[assembler]-[binner]-[sample/group]-binDepths.heatmap.png` +- [#315](https://github.com/nf-core/mag/pull/315) - Replace base container for standard shell tools to fix problems with running on Google Cloud + +### `Fixed` + +- [#290](https://github.com/nf-core/mag/pull/290) - Fix caching of binning input +- [#305](https://github.com/nf-core/mag/pull/305) - Add missing Bowtie2 version for process `BOWTIE2_PHIX_REMOVAL_ALIGN` to `software_versions.yml` +- [#307](https://github.com/nf-core/mag/pull/307) - Fix retrieval of GTDB-Tk version (note about newer version caused error in `CUSTOM_DUMPSOFTWAREVERSIONS`) +- [#309](https://github.com/nf-core/mag/pull/309) - Fix publishing of BUSCO `busco_downloads/` folder, i.e. 
publish only when `--save_busco_reference` is specified +- [#321](https://github.com/nf-core/mag/pull/321) - Fix parameter processing in `BOWTIE2_REMOVAL_ALIGN` (which was erroneously for `BOWTIE2_PHIX_REMOVAL_ALIGN`) + +### `Dependencies` + +| Tool | Previous version | New version | +| ------- | ---------------- | ----------- | +| fastp | 0.20.1 | 0.23.2 | +| MultiQC | 1.11 | 1.12 | + +## v2.1.1 - 2021/11/25 + +### `Added` + +- [#240](https://github.com/nf-core/mag/pull/240) - Add prodigal to predict protein-coding genes for assemblies. +- [#241](https://github.com/nf-core/mag/pull/241) - Add parameter `--skip_prodigal`. +- [#244](https://github.com/nf-core/mag/pull/244) - Add pipeline preprint information. +- [#245](https://github.com/nf-core/mag/pull/245) - Add Prokka to annotate binned genomes. + +### `Changed` + +- [#249](https://github.com/nf-core/mag/pull/249) - Update workflow overview figure. +- [#258](https://github.com/nf-core/mag/pull/258) - Updated MultiQC 1.9 to 1.11. +- [#260](https://github.com/nf-core/mag/pull/260) - Updated SPAdes 3.13.1 -> 3.15.3, MEGAHIT 1.2.7 -> 1.2.7 + +### `Fixed` + +- [#256](https://github.com/nf-core/mag/pull/256) - Fix `--skip_busco`. +- [#236](https://github.com/nf-core/mag/pull/236) - Fix large assemblies (> 4 billion nucleotides in length). +- [#254](https://github.com/nf-core/mag/pull/254) - Fix MetaBAT2 error with nextflow version 21.10.x (21.04.03 is the latest functional version for nf-core/mag 2.1.0). +- [#255](https://github.com/nf-core/mag/pull/255) - Update gtdbtk conda channel. +- [#258](https://github.com/nf-core/mag/pull/258) - FastP results are now in MultiQC. + +## v2.1.0 - 2021/07/29 + +### `Added` + +- [#212](https://github.com/nf-core/mag/pull/212), [#214](https://github.com/nf-core/mag/pull/214) - Add bin abundance estimation based on median sequencing depths of corresponding contigs (results are written to `results/GenomeBinning/bin_depths_summary.tsv` and `results/GenomeBinning/bin_summary.tsv`) [#197](https://github.com/nf-core/mag/issues/197). +- [#214](https://github.com/nf-core/mag/pull/214) - Add generation of (clustered) heat maps with bin abundances across samples (using centered log-ratios) +- [#217](https://github.com/nf-core/mag/pull/217) - Publish genes predicted with Prodigal within BUSCO run (written to `results/GenomeBinning/QC/BUSCO/[assembler]-[bin]_prodigal.gff`). + +### `Changed` + +- [#218](https://github.com/nf-core/mag/pull/218) - Update to nf-core 2.0.1 `TEMPLATE` (DSL2) + +### `Fixed` + +- [#226](https://github.com/nf-core/mag/pull/226) - Fix handling of `BUSCO` output when run in auto lineage selection mode and selected specific lineage is the same as the generic one. + +## v2.0.0 - 2021/06/01 + +### `Added` + +- [#179](https://github.com/nf-core/mag/pull/179) - Add BUSCO automated lineage selection functionality (new default). The pameter `--busco_auto_lineage_prok` can be used to only consider prokaryotes and the parameter `--busco_download_path` to run BUSCO in `offline` mode. +- [#178](https://github.com/nf-core/mag/pull/178) - Add taxonomic bin classification with `GTDB-Tk` `v1.5.0` (for bins filtered based on `BUSCO` QC metrics). +- [#196](https://github.com/nf-core/mag/pull/196) - Add process for CAT database creation as an alternative to using pre-built databases. 
+ +### `Changed` + +- [#162](https://github.com/nf-core/mag/pull/162) - Switch to DSL2 +- [#162](https://github.com/nf-core/mag/pull/162) - Changed `--input` file format from `TSV` to `CSV` format, requires header now +- [#162](https://github.com/nf-core/mag/pull/162) - Update `README.md`, `docs/usage.md` and `docs/output.md` +- [#162](https://github.com/nf-core/mag/pull/162) - Update `FastP` from version `0.20.0` to `0.20.1` +- [#162](https://github.com/nf-core/mag/pull/162) - Update `Bowtie2` from version `2.3.5` to `2.4.2` +- [#162](https://github.com/nf-core/mag/pull/162) - Update `FastQC` from version `0.11.8` to `0.11.9` +- [#172](https://github.com/nf-core/mag/pull/172) - Compressed discarded MetaBAT2 output files +- [#176](https://github.com/nf-core/mag/pull/176) - Update CAT DB link +- [#179](https://github.com/nf-core/mag/pull/179) - Update `BUSCO` from version `4.1.4` to `5.1.0` +- [#179](https://github.com/nf-core/mag/pull/179) - By default BUSCO now performs automated lineage selection instead of using the bacteria_odb10 lineage as reference. Specific lineage datasets can still be provided via `--busco_reference`. +- [#178](https://github.com/nf-core/mag/pull/178) - Change output file: `results/GenomeBinning/QC/quast_and_busco_summary.tsv` -> `results/GenomeBinning/bin_summary.tsv`, contains GTDB-Tk results as well. +- [#191](https://github.com/nf-core/mag/pull/191) - Update to nf-core 1.14 `TEMPLATE` +- [#193](https://github.com/nf-core/mag/pull/193) - Compress CAT output files [#180](https://github.com/nf-core/mag/issues/180) +- [#198](https://github.com/nf-core/mag/pull/198) - Requires nextflow version `>= 21.04.0` +- [#200](https://github.com/nf-core/mag/pull/200) - Small changes in GitHub Actions tests +- [#203](https://github.com/nf-core/mag/pull/203) - Renamed `fastp` params and improved description in documentation: `--mean_quality` -> `--fastp_qualified_quality`, `--trimming_quality` -> `--fastp_cut_mean_quality` + +### `Fixed` + +- [#175](https://github.com/nf-core/mag/pull/175) - Fix bug in retrieving the `--max_unbinned_contigs` longest unbinned sequences that are longer than `--min_length_unbinned_contigs` (`split_fasta.py`) +- [#175](https://github.com/nf-core/mag/pull/175) - Improved runtime of `split_fasta.py` in `METABAT2` process (important for large assemblies, e.g. 
when computing co-assemblies) +- [#194](https://github.com/nf-core/mag/pull/194) - Allow different folder structures for Kraken2 databases containing `*.k2d` files [#187](https://github.com/nf-core/mag/issues/187) +- [#195](https://github.com/nf-core/mag/pull/195) - Fix documentation regarding required compression of input FastQ files [#160](https://github.com/nf-core/mag/issues/160) +- [#196](https://github.com/nf-core/mag/pull/196) - Add process for CAT database creation as solution for problem caused by incompatible `DIAMOND` version used for pre-built `CAT database` and `CAT classification` [#90](https://github.com/nf-core/mag/issues/90), [#188](https://github.com/nf-core/mag/issues/188) + +## v1.2.0 - 2021/02/10 + +### `Added` + +- [#146](https://github.com/nf-core/mag/pull/146) - Add `--coassemble_group` parameter to allow group-wise co-assembly +- [#146](https://github.com/nf-core/mag/pull/146) - Add `--binning_map_mode` parameter allowing different mapping strategies to compute co-abundances used for binning (`all`, `group`, `own`) +- [#149](https://github.com/nf-core/mag/pull/149) - Add two new parameters to allow custom SPAdes and MEGAHIT options (`--spades_options` and `--megahit_options`) + +### `Changed` + +- [#141](https://github.com/nf-core/mag/pull/141) - Update to nf-core 1.12.1 `TEMPLATE` +- [#143](https://github.com/nf-core/mag/pull/143) - Manifest file has to be handed over via `--input` parameter now +- [#143](https://github.com/nf-core/mag/pull/143) - Changed format of manifest input file: requires a '.tsv' suffix and additionally contains group ID +- [#143](https://github.com/nf-core/mag/pull/143) - TSV `--input` file allows now also entries containing only short reads +- [#145](https://github.com/nf-core/mag/pull/145) - When using TSV input files, uses sample IDs now for `FastQC` instead of basenames of original read files. Allows non-unique file basenames. 
+ +### `Removed` + +- [#143](https://github.com/nf-core/mag/pull/143) - Change parameter: `--manifest` -> `--input` + +## v1.1.2 - 2020/11/24 + +### `Changed` + +- [#135](https://github.com/nf-core/mag/pull/135) - Update to nf-core 1.12 `TEMPLATE` + +### `Fixed` + +- [#133](https://github.com/nf-core/mag/pull/133) - Fixed processing of `--input` parameter [#131](https://github.com/nf-core/mag/issues/131) + +## v1.1.1 - 2020/11/10 + +### `Added` + +- [#121](https://github.com/nf-core/mag/pull/121) - Add full-size test +- [#124](https://github.com/nf-core/mag/pull/124) - Add worfklow overview figure to `README` + +### `Changed` + +- [#123](https://github.com/nf-core/mag/pull/123) - Update to new nf-core 1.11 `TEMPLATE` + +### `Fixed` + +- [#118](https://github.com/nf-core/mag/pull/118) - Fix `seaborn` to `v0.10.1` to avoid `nanoplot` error +- [#120](https://github.com/nf-core/mag/pull/120) - Fix link to CAT database in help message +- [#124](https://github.com/nf-core/mag/pull/124) - Fix description of `CAT` process in `output.md` + +## v1.1.0 - 2020/10/06 + +### `Added` + +- [#35](https://github.com/nf-core/mag/pull/35) - Add social preview image +- [#49](https://github.com/nf-core/mag/pull/49) - Add host read removal with `Bowtie 2` and according custom section to `MultiQC` +- [#49](https://github.com/nf-core/mag/pull/49) - Add separate `MultiQC` section for `FastQC` after preprocessing +- [#65](https://github.com/nf-core/mag/pull/65) - Add `MetaBAT2` RNG seed parameter `--metabat_rng_seed` and set the default to 1 which ensures reproducible binning results +- [#65](https://github.com/nf-core/mag/pull/65) - Add parameters `--megahit_fix_cpu_1`, `--spades_fix_cpus` and `--spadeshybrid_fix_cpus` to ensure reproducible results from assembly tools +- [#66](https://github.com/nf-core/mag/pull/66) - Export `depth.txt.gz` into result folder +- [#67](https://github.com/nf-core/mag/pull/67) - Compress assembly files +- [#82](https://github.com/nf-core/mag/pull/82) - Add `nextflow_schema.json` +- [#104](https://github.com/nf-core/mag/pull/104) - Add parameter `--save_busco_reference` + +### `Changed` + +- [#56](https://github.com/nf-core/mag/pull/56) - Update `MetaBAT2` from `v2.13` to `v2.15` +- [#46](https://github.com/nf-core/mag/pull/46) - Update `MultiQC` from `v1.7` to `v1.9` +- [#88](https://github.com/nf-core/mag/pull/88) - Update to new nf-core 1.10.2 `TEMPLATE` +- [#88](https://github.com/nf-core/mag/pull/88) - `--reads` is now removed, use `--input` instead +- [#101](https://github.com/nf-core/mag/pull/101) - Prevented PhiX alignments from being stored in work directory [#97](https://github.com/nf-core/mag/issues/97) +- [#104](https://github.com/nf-core/mag/pull/104), [#111](https://github.com/nf-core/mag/pull/111) - Update `BUSCO` from `v3.0.2` to `v4.1.4` + +### `Fixed` + +- [#29](https://github.com/nf-core/mag/pull/29) - Fix `MetaBAT2` binning discards unbinned contigs [#27](https://github.com/nf-core/mag/issues/27) +- [#31](https://github.com/nf-core/mag/pull/31), [#36](https://github.com/nf-core/mag/pull/36), [#76](https://github.com/nf-core/mag/pull/76), [#107](https://github.com/nf-core/mag/pull/107) - Fix links in README +- [#47](https://github.com/nf-core/mag/pull/47) - Fix missing `MultiQC` when `--skip_quast` or `--skip_busco` was specified +- [#49](https://github.com/nf-core/mag/pull/49), [#89](https://github.com/nf-core/mag/pull/89) - Added missing parameters to summary +- [#50](https://github.com/nf-core/mag/pull/50) - Fix missing channels when `--keep_phix` is specified +- 
[#54](https://github.com/nf-core/mag/pull/54) - Updated links to `minikraken db` +- [#54](https://github.com/nf-core/mag/pull/54) - Fixed `Kraken2` db preparation: allow different names for compressed archive file and contained folder as for some minikraken dbs +- [#55](https://github.com/nf-core/mag/pull/55) - Fixed channel joining for multiple samples causing `MetaBAT2` error [#32](https://github.com/nf-core/mag/issues/32) +- [#57](https://github.com/nf-core/mag/pull/57) - Fix number of threads used by `MetaBAT2` program `jgi_summarize_bam_contig_depths` +- [#70](https://github.com/nf-core/mag/pull/70) - Fix `SPAdes` memory conversion issue [#61](https://github.com/nf-core/mag/issues/61) +- [#71](https://github.com/nf-core/mag/pull/71) - No more ignoring errors in `SPAdes` assembly +- [#72](https://github.com/nf-core/mag/pull/72) - No more ignoring of `BUSCO` errors +- [#73](https://github.com/nf-core/mag/pull/73), [#75](https://github.com/nf-core/mag/pull/75) - Improved output documentation +- [#96](https://github.com/nf-core/mag/pull/96) - Fix missing bin names in `MultiQC` BUSCO section [#78](https://github.com/nf-core/mag/issues/78) +- [#104](https://github.com/nf-core/mag/pull/104) - Fix `BUSCO` errors causing missing summary output [#77](https://github.com/nf-core/mag/issues/77) + +### `Deprecated` + +- [#29](https://github.com/nf-core/mag/pull/29) - Change deprecated parameters: `--singleEnd` -> `--single_end`, `--igenomesIgnore` -> `--igenomes_ignore` + +## v1.0.0 - 2019/12/20 + +Initial release of nf-core/mag, created with the [nf-core](http://nf-co.re/) template. + +### `Added` + +- short and long reads QC (fastp, porechop, filtlong, fastqc) +- Lambda and PhiX detection and filtering (bowtie2, nanolyse) +- Taxonomic classification of reads (centrifuge, kraken2) +- Short read and hybrid assembly (megahit, metaspades) +- metagenome binning (metabat2) +- QC of bins (busco, quast) +- annotation (cat/bat) diff --git a/CITATIONS.md b/CITATIONS.md index 0b6c25b1..2feb3693 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,13 +10,159 @@ ## Pipeline tools +- [AdapterRemoval2](https://doi.org/10.1186/s13104-016-1900-2) + + > Schubert, M., Lindgreen, S., and Orlando, L. 2016. "AdapterRemoval v2: Rapid Adapter Trimming, Identification, and Read Merging." BMC Research Notes 9 (February): 88. doi: 10.1186/s13104-016-1900-2 + +- [BBnorm/BBTools](http://sourceforge.net/projects/bbmap/) + +- [BCFtools](https://doi.org/10.1093/gigascience/giab008) + + > Danecek, Petr, et al. "Twelve years of SAMtools and BCFtools." Gigascience 10.2 (2021): giab008. doi: 10.1093/gigascience/giab008 + +- [Bowtie2](https://dx.doi.org/10.1038/nmeth.1923) + + > Langmead, B. and Salzberg, S. L. 2012. Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: 10.1038/nmeth.1923. + +- [Busco](https://doi.org/10.1007/978-1-4939-9173-0_14) + + > Seppey, M., Manni, M., & Zdobnov, E. M. (2019). BUSCO: assessing genome assembly and annotation completeness. In Gene prediction (pp. 227-245). Humana, New York, NY. doi: 10.1007/978-1-4939-9173-0_14. + +- [CAT](https://doi.org/10.1186/s13059-019-1817-x) + + > von Meijenfeldt, F. B., Arkhipova, K., Cambuy, D. D., Coutinho, F. H., & Dutilh, B. E. (2019). Robust taxonomic classification of uncharted microbial sequences and bins with CAT and BAT. Genome biology, 20(1), 1-14. doi: 10.1186/s13059-019-1817-x. + +- [Centrifuge](https://doi.org/10.1101/gr.210641.116) + + > Kim, D., Song, L., Breitwieser, F. P., & Salzberg, S. L. (2016).
Centrifuge: rapid and sensitive classification of metagenomic sequences. Genome research, 26(12), 1721-1729. doi: 10.1101/gr.210641.116. + +- [CheckM](https://doi.org/10.1101/gr.186072.114) + + > Parks, D. H., Imelfort, M., Skennerton, C. T., Hugenholtz, P., & Tyson, G. W. (2015). CheckM: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes. Genome Research, 25(7), 1043–1055. doi: 10.1101/gr.186072.114 + +- [CheckM2](https://doi.org/10.1038/s41592-023-01940-w) + + > Chklovski, A., Parks, D. H., Woodcroft, B. J., & Tyson, G. W. (2023). CheckM2: a rapid, scalable and accurate tool for assessing microbial genome quality using machine learning. Nature Methods, 20(8), 1203-1212. doi: https://doi.org/10.1038/s41592-023-01940-w + +- [Chopper](https://doi.org/10.1093/bioinformatics/bty149) + + > De Coster W, D'Hert S, Schultz DT, Cruts M, Van Broeckhoven C. NanoPack: visualizing and processing long-read sequencing data. Bioinformatics. 2018 Aug 1;34(15):2666-2669. doi: 10.1093/bioinformatics/bty149 + +- [CONCOCT](https://doi.org/10.1038/nmeth.3103) + + > Alneberg, J., Bjarnason, B. S., de Bruijn, I., Schirmer, M., Quick, J., Ijaz, U. Z., Lahti, L., Loman, N. J., Andersson, A. F., & Quince, C. (2014). Binning metagenomic contigs by coverage and composition. Nature Methods, 11(11), 1144–1146. doi: 10.1038/nmeth.3103 + +- [DAS Tool](https://doi.org/10.1038/s41564-018-0171-1) + + > Sieber, C. M. K., et al. 2018. "Recovery of Genomes from Metagenomes via a Dereplication, Aggregation and Scoring Strategy." Nature Microbiology 3 (7): 836-43. doi: 10.1038/s41564-018-0171-1 + +- [FastP](https://doi.org/10.1093/bioinformatics/bty560) + + > Chen, S., Zhou, Y., Chen, Y., & Gu, J. (2018). fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics , 34(17), i884–i890. doi: 10.1093/bioinformatics/bty560. + - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) -> Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + +- [Filtlong](https://github.com/rrwick/Filtlong) + +- [Freebayes](https://arxiv.org/abs/1207.3907) + + > Garrison E, Marth G. Haplotype-based variant detection from short-read sequencing. arXiv preprint arXiv:1207.3907 [q-bio.GN] 2012 + +- [geNomad](https://doi.org/10.1101/2023.03.05.531206) + + > Camargo, A. P., et al. (2023). You can move, but you can’t hide: identification of mobile genetic elements with geNomad. bioRxiv preprint. doi: 10.1101/2023.03.05.531206 + +- [GTDB-Tk](https://doi.org/10.1093/bioinformatics/btz848) + + > Chaumeil, P. A., Mussig, A. J., Hugenholtz, P., & Parks, D. H. (2020). GTDB-Tk: a toolkit to classify genomes with the Genome Taxonomy Database. Bioinformatics , 36(6), 1925–1927. doi: 10.1093/bioinformatics/btz848. + +- [GUNC](https://doi.org/10.1186/s13059-021-02393-0.) + + > Orakov, A., Fullam, A., Coelho, A. P., Khedkar, S., Szklarczyk, D., Mende, D. R., Schmidt, T. S. B., and Bork, P.. 2021. “GUNC: Detection of Chimerism and Contamination in Prokaryotic Genomes.” Genome Biology 22 (1): 178. doi: 10.1186/s13059-021-02393-0. + +- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) + + > Wood, D et al., 2019. Improved metagenomic analysis with Kraken 2. Genome Biology volume 20, Article number: 257. doi: 10.1186/s13059-019-1891-0. + +- [Krona](https://doi.org/10.1186/1471-2105-12-385) + + > Ondov, B. D., Bergman, N. H., & Phillippy, A. M. (2011). 
Interactive metagenomic visualization in a Web browser. BMC bioinformatics, 12(1), 1-10. doi: 10.1186/1471-2105-12-385. + +- [MaxBin2](https://doi.org/10.1093/bioinformatics/btv638) + + > Yu-Wei, W., Simmons, B. A. & Singer, S. W. (2015) MaxBin 2.0: An Automated Binning Algorithm to Recover Genomes from Multiple Metagenomic Datasets. Bioinformatics 32 (4): 605–7. doi: 10.1093/bioinformatics/btv638. + +- [MEGAHIT](https://doi.org/10.1016/j.ymeth.2016.02.020) + + > Li, D., Luo, R., Liu, C. M., Leung, C. M., Ting, H. F., Sadakane, K., ... & Lam, T. W. (2016). MEGAHIT v1. 0: a fast and scalable metagenome assembler driven by advanced methodologies and community practices. Methods, 102, 3-11. doi: 10.1016/j.ymeth.2016.02.020. + +- [MetaBAT2](https://doi.org/10.7717/peerj.7359) + + > Kang, D. D., Li, F., Kirton, E., Thomas, A., Egan, R., An, H., & Wang, Z. (2019). MetaBAT 2: an adaptive binning algorithm for robust and efficient genome reconstruction from metagenome assemblies. PeerJ, 7, e7359. doi: 10.7717/peerj.7359. + +- [MetaEuk](https://doi.org/10.1186/s40168-020-00808-x) + + > Levy Karin, E., Mirdita, M. & Söding, J. MetaEuk—sensitive, high-throughput gene discovery, and annotation for large-scale eukaryotic metagenomics. Microbiome 8, 48 (2020). 10.1186/s40168-020-00808-x + +- [MMseqs2](https://www.nature.com/articles/nbt.3988) + + > Steinegger, M., Söding, J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol 35, 1026–1028 (2017).10.1038/nbt.3988 - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) -> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. + > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. + +- [NanoLyse](https://doi.org/10.1093/bioinformatics/bty149) + + > De Coster, W., D’Hert, S., Schultz, D. T., Cruts, M., & Van Broeckhoven, C. (2018). NanoPack: visualizing and processing long-read sequencing data. Bioinformatics, 34(15), 2666-2669. doi: 10.1093/bioinformatics/bty149. + +- [NanoPlot](https://doi.org/10.1093/bioinformatics/bty149) + + > De Coster, W., D’Hert, S., Schultz, D. T., Cruts, M., & Van Broeckhoven, C. (2018). NanoPack: visualizing and processing long-read sequencing data. Bioinformatics, 34(15), 2666-2669. doi: 10.1093/bioinformatics/bty149. + +- [Nanoq](https://doi.org/10.21105/joss.02991) + + > Steinig, E., Coin, L. (2022). Nanoq: ultra-fast quality control for nanopore reads. Journal of Open Source Software, 7(69), 2991, doi: 10.21105/joss.02991 + +- [Porechop](https://github.com/rrwick/Porechop) + +- [Porechop-abi](https://github.com/bonsai-team/Porechop_ABI) + + > Bonenfant, Q., Noé, L., & Touzet, H. (2022). Porechop_ABI: discovering unknown adapters in ONT sequencing reads for downstream trimming. bioRxiv. 10.1101/2022.07.07.499093 + +- [Prodigal](https://pubmed.ncbi.nlm.nih.gov/20211023/) + + > Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, Hauser LJ. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics. 2010 Mar 8;11:119. doi: 10.1186/1471-2105-11-119. 
PMID: 20211023; PMCID: PMC2848648. + +- [Prokka](https://pubmed.ncbi.nlm.nih.gov/24642063/) + + > Seemann T. Prokka: rapid prokaryotic genome annotation. Bioinformatics. 2014 Jul 15;30(14):2068-9. doi: 10.1093/bioinformatics/btu153. Epub 2014 Mar 18. PMID: 24642063. + +- [PyDamage](https://doi.org/10.7717/peerj.11845) + + > Borry M, Hübner A, Rohrlach AB, Warinner C. 2021. PyDamage: automated ancient damage identification and estimation for contigs in ancient DNA de novo assembly. PeerJ 9:e11845 doi: 10.7717/peerj.11845 + +- [SAMtools](https://doi.org/10.1093/bioinformatics/btp352) + + > Li, H., Handsaker, B., Wysoker, A., Fennell, T., Ruan, J., Homer, N., … 1000 Genome Project Data Processing Subgroup. (2009). The Sequence Alignment/Map format and SAMtools. Bioinformatics , 25(16), 2078–2079. doi: 10.1093/bioinformatics/btp352. + +- [Seqtk](https://github.com/lh3/seqtk) + +- [SPAdes](https://doi.org/10.1101/gr.213959.116) + + > Nurk, S., Meleshko, D., Korobeynikov, A., & Pevzner, P. A. (2017). metaSPAdes: a new versatile metagenomic assembler. Genome research, 27(5), 824-834. doi: 10.1101/gr.213959.116. + +- [Tiara](https://doi.org/10.1093/bioinformatics/btab672) + + > Karlicki, M., Antonowicz, S., Karnkowska, A., 2022. Tiara: deep learning-based classification system for eukaryotic sequences. Bioinformatics 38, 344–350. doi: 10.1093/bioinformatics/btab672 + +## Data + +- [Full-size test data](https://doi.org/10.1038/s41587-019-0191-2) + + > Bertrand, D., Shaw, J., Kalathiyappan, M., Ng, A. H. Q., Kumar, M. S., Li, C., ... & Nagarajan, N. (2019). Hybrid metagenomic assembly enables high-resolution analysis of resistance determinants and mobile elements in human microbiomes. Nature biotechnology, 37(8), 937-944. doi: 10.1038/s41587-019-0191-2. ## Software packaging/containerisation tools diff --git a/README.md b/README.md index 9a23fd99..bee71ce3 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,14 @@

- nf-core/mag logo (header image)
+ nf-core/mag logo (header image)

[![GitHub Actions CI Status](https://github.com/nf-core/mag/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/mag/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/nf-core/mag/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/mag/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/mag/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![GitHub Actions Linting Status](https://github.com/nf-core/mag/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/mag/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/mag/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3589527-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3589527) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) +[![Cite Publication](https://img.shields.io/badge/Cite%20Us!-Cite%20Publication-orange)](https://doi.org/10.1093/nargab/lqac007) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) @@ -19,48 +20,47 @@ ## Introduction -**nf-core/mag** is a bioinformatics pipeline that ... +**nf-core/mag** is a bioinformatics best-practise analysis pipeline for assembly, binning and annotation of metagenomes. - +

+ nf-core/mag workflow overview +

- -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +## Pipeline summary ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - -Now, you can run the pipeline using: +```bash +nextflow run nf-core/mag -profile <docker/singularity/.../institute> --input '*_R{1,2}.fastq.gz' --outdir <OUTDIR> +``` - +or ```bash -nextflow run nf-core/mag \ - -profile <docker/singularity/.../institute> \ - --input samplesheet.csv \ - --outdir <OUTDIR> +nextflow run nf-core/mag -profile <docker/singularity/.../institute> --input samplesheet.csv --outdir <OUTDIR> ``` > [!WARNING] @@ -74,13 +74,42 @@ To see the results of an example test run with a full size dataset refer to the For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/mag/output). +### Group-wise co-assembly and co-abundance computation + +Each sample has an associated group ID (see [input specifications](https://nf-co.re/mag/usage#input_specifications)). This group information can be used for group-wise co-assembly with `MEGAHIT` or `SPAdes` and/or to compute co-abundances for the binning step with `MetaBAT2`. By default, group-wise co-assembly is disabled, while the computation of group-wise co-abundances is enabled. For more information about how this group information can be used see the documentation for the parameters [`--coassemble_group`](https://nf-co.re/mag/parameters#coassemble_group) and [`--binning_map_mode`](https://nf-co.re/mag/parameters#binning_map_mode). + +When group-wise co-assembly is enabled, `SPAdes` is run on accordingly pooled read files, since `metaSPAdes` does not yet allow the input of multiple samples or libraries. In contrast, `MEGAHIT` is run for each group while supplying lists of the individual read files. + ## Credits -nf-core/mag was originally written by Hadrien Gourlé, Daniel Straub, Sabrina Krakau, James A. Fellows Yates, Maxime Borry. +nf-core/mag was written by [Hadrien Gourlé](https://hadriengourle.com) at [SLU](https://slu.se), [Daniel Straub](https://github.com/d4straub) and [Sabrina Krakau](https://github.com/skrakau) at the [Quantitative Biology Center (QBiC)](http://qbic.life). [James A. Fellows Yates](https://github.com/jfy133) and [Maxime Borry](https://github.com/maxibor) at the [Max Planck Institute for Evolutionary Anthropology](https://www.eva.mpg.de) joined in version 2.2.0.
+ +Other code contributors include: + +- [Antonia Schuster](https://github.com/AntoniaSchuster) +- [Alexander Ramos](https://github.com/alxndrdiaz) +- [Carson Miller](https://github.com/CarsonJM) +- [Daniel Lundin](https://github.com/erikrikarddaniel) +- [Danielle Callan](https://github.com/d-callan) +- [Gregory Sprenger](https://github.com/gregorysprenger) +- [Jim Downie](https://github.com/prototaxites) +- [Phil Palmer](https://github.com/PhilPalmer) +- [@willros](https://github.com/willros) +- [Adam Rosenbaum](https://github.com/muabnezor) +- [Diego Alvarez](https://github.com/dialvarezs) + +Long read processing was inspired by [caspargross/HybridAssembly](https://github.com/caspargross/HybridAssembly) written by Caspar Gross [@caspargross](https://github.com/caspargross) We thank the following people for their extensive assistance in the development of this pipeline: - +- [Alexander Peltzer](https://github.com/apeltzer) +- [Phil Ewels](https://github.com/ewels) +- [Gisela Gabernet](https://github.com/ggabernet) +- [Harshil Patel](https://github.com/drpatelh) +- [Johannes Alneberg](https://github.com/alneberg) +- [Maxime Garcia](https://github.com/MaxUlysse) +- [Michael L Heuer](https://github.com/heuermh) +- [Alex Hübner](https://github.com/alexhbnr) ## Contributions and Support @@ -90,10 +119,15 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations - - +If you use nf-core/mag for your analysis, please cite the preprint as follows: + +> **nf-core/mag: a best-practice pipeline for metagenome hybrid assembly and binning** +> +> Sabrina Krakau, Daniel Straub, Hadrien Gourlé, Gisela Gabernet, Sven Nahnsen. +> +> NAR Genom Bioinform. 2022 Feb 2;4(1):lqac007. doi: [10.1093/nargab/lqac007](https://doi.org/10.1093/nargab/lqac007). - +Additionally you can cite the pipeline directly with the following doi: [10.5281/zenodo.3589527](https://doi.org/10.5281/zenodo.3589527) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/assets/data/GCA_000840245.1_ViralProj14204_genomic.fna.gz b/assets/data/GCA_000840245.1_ViralProj14204_genomic.fna.gz new file mode 100644 index 00000000..2fa0ed2a Binary files /dev/null and b/assets/data/GCA_000840245.1_ViralProj14204_genomic.fna.gz differ diff --git a/assets/data/GCA_002596845.1_ASM259684v1_genomic.fna.gz b/assets/data/GCA_002596845.1_ASM259684v1_genomic.fna.gz new file mode 100644 index 00000000..511f57e8 Binary files /dev/null and b/assets/data/GCA_002596845.1_ASM259684v1_genomic.fna.gz differ diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index 7bbc48fc..d3fe4abe 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -7,7 +7,7 @@ plot_type: "html" ## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

-  Data was processed using nf-core/mag v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

+  Data was processed using nf-core/mag v${workflow.manifest.version} (${doi_text}; Krakau et al., 2022) of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:

${workflow.commandLine}

${tool_citations}

@@ -15,6 +15,7 @@ data: |
  • Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. doi: 10.1038/nbt.3820
  • Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. doi: 10.1038/s41587-020-0439-x
+  • Krakau, S., Straub, D., Gourlé, H., Gabernet, G., & Nahnsen, S. (2022). nf-core/mag: a best-practice pipeline for metagenome hybrid assembly and binning. NAR Genomics and Bioinformatics, 4(1). doi: 10.1093/nargab/lqac007
  • Grüning, B., Dale, R., Sjödin, A., Chapman, B. A., Rowe, J., Tomkins-Tinch, C. H., Valieris, R., Köster, J., & Bioconda Team. (2018). Bioconda: sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7), 475–476. doi: 10.1038/s41592-018-0046-7
  • da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics (Oxford, England), 33(16), 2580–2582. doi: 10.1093/bioinformatics/btx192
  • ${tool_bibliography} diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index a4c656e2..5b76f018 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -13,3 +13,249 @@ report_section_order: export_plots: true disable_version_detection: true +data_format: "yaml" + +run_modules: + - fastqc + - fastp + - adapterRemoval + - custom_content + - bowtie2 + - busco + - quast + - kraken + - prokka + - porechop + - filtlong + +## Module order +top_modules: + - "fastqc": + name: "FastQC: raw reads" + path_filters_exclude: + - "*trimmed*" + - "fastp" + - "adapterRemoval" + - "porechop" + - "filtlong" + - "fastqc": + name: "FastQC: after preprocessing" + info: "After trimming and, if requested, contamination removal." + path_filters: + - "*trimmed*" + - "bowtie2": + name: "Bowtie2: PhiX removal" + info: "Mapping statistics of reads mapped against PhiX and subsequently removed." + path_filters: + - "*_phix_removed.bowtie2.log" + - "bowtie2": + name: "Bowtie2: host removal" + info: "Mapping statistics of reads mapped against host genome and subsequently + removed." + path_filters: + - "*_host_removed.bowtie2.log" + - "kraken": + name: "Kraken2" + anchor: "Kraken2" + target: "Kraken2" + doi: "10.1101/gr.210641.116" + path_filters: + - "*.kraken2_report.txt" + - "kraken": + name: "Centrifuge" + anchor: "centrifuge" + target: "Centrifuge" + doi: "10.1101/gr.210641.116" + info: "is a very rapid and memory-efficient system for the classification of DNA + sequences from microbial samples. The system uses a novel indexing scheme based + on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index. + Note: Figure title" + extra: "ℹ️: plot title will say Kraken2 due to Centrifuge producing the same output + format as Kraken. If activated, see the actual Kraken2 results in the section + above." + path_filters: + - "*.centrifuge_kreport.txt" + - "quast": + name: "QUAST: assembly" + info: "Assembly statistics of raw assemblies." + path_filters: + - "*rawassemblies.tsv" + - "bowtie2": + name: "Bowtie2: assembly" + info: "Mapping statistics of reads mapped against assemblies." + path_filters_exclude: + - "*_host_removed.bowtie2.log" + - "*_phix_removed.bowtie2.log" + - "bcftools" + - custom_content + - "quast": + name: "QUAST: bins" + info: "Assembly statistics of binned assemblies." + path_filters_exclude: + - "*rawassemblies.tsv" + - "busco": + info: "assesses genome assembly and annotation completeness with Benchmarking + Universal Single-Copy Orthologs. In case BUSCO's automated lineage selection + was used, only generic results for the selected domain are shown and only for + genome bins and kept, unbinned contigs for which the BUSCO analysis was successfull, + i.e. not for contigs for which no BUSCO genes could be found. Bins for which + a specific virus lineage was selected are also not shown." + - "prokka" + +custom_data: + host_removal: + file_format: "tsv" + section_name: "Bowtie 2: host read removal" + description: "Reads are mapped against the host reference sequence. Only reads + that do not align (concordantly) are kept for further analysis." 
+ plot_type: "bargraph" + pconfig: + id: "host_removal_bargraph" + title: "Bowtie 2: reads mapped against host reference" + ylab: "# Reads" + +## Sample name cleaning +sp: + host_removal: + fn: "host_removal_metrics.tsv" + adapterRemoval: + fn: "*_ar2.settings" + kraken: + fn_re: ".*[kraken2|centrifuge].*report.txt" + quast: + fn_re: "report.*.tsv" + filtlong: + num_lines: 20 + fn_re: ".*_filtlong.log" + +## File name cleaning +extra_fn_clean_exts: + - ".bowtie2" + - "_ar2" + - "host_removed" + - "phix_removed" + - "centrifuge_kreport" + - "_fastp" + +## Prettification +custom_logo: "mag_logo_mascot_light.png" +custom_logo_url: https://github.com/nf-core/mag/ +custom_logo_title: "nf-core/mag" + +## Tool specific configuration +prokka_fn_snames: true + +## General Stats customisation +table_columns_visible: + "FastQC: raw reads": + avg_sequence_length: true + "FastQC: after preprocessing": + avg_sequence_length: true + "fastp": + pct_duplication: false + after_filtering_q30_rate: false + after_filtering_q30_bases: false + filtering_result_passed_filter_reads: 3300 + after_filtering_gc_content: false + pct_surviving: true + pct_adapter: true + "Kraken2": false + "Centrifuge": false + "QUAST: assembly": + N75: true + L50: true + L75: true + "Largest contig": true + "Total length": true + N50: true + "Bowtie2: assembly": false + "QUAST: bins": + N75: true + L50: true + L75: true + "Largest contig": true + "Total length": true + N50: true + "Prokka": false + +table_columns_placement: + "FastQC: raw reads": + percent_duplicates: 1000 + percent_gc: 1100 + avg_sequence_length: 1200 + median_sequence_length: 1300 + total_sequences: 1400 + percent_fails: 1500 + "FastQC: after preprocessing": + percent_duplicates: 2000 + percent_gc: 2100 + avg_sequence_length: 2200 + median_sequence_length: 2300 + total_sequences: 2400 + percent_fails: 2500 + "fastp": + pct_duplication: 3000 + after_filtering_q30_rate: 3100 + after_filtering_q30_bases: 3200 + filtering_result_passed_filter_reads: 3300 + after_filtering_gc_content: 3400 + pct_surviving: 3500 + pct_adapter: 3600 + "Adapter Removal": + percent_aligned: 4000 + aligned_total: 4100 + percent_discarded: 4200 + "Bowtie2: PhiX removal": + overall_alignment_rate: 5000 + "Bowtie2: host removal": + overall_alignment_rate: 6000 + "Kraken2": + "% root": 8000 + "% Top 5": 8100 + "% Unclassified": 8200 + "Centrifuge": + "% root": 9000 + "% Top 5": 9100 + "% Unclassified": 9200 + "QUAST: assembly": + "N50": 10000 + "Total length": 11000 + "Bowtie2: assembly": + overall_alignment_rate: 7000 + "QUAST: bins": + "N50": 10000 + "Total length": 11000 + Prokka: + contigs: 20000 + bases: 21000 + CDS: 22000 + organism: 23000 + +table_columns_name: + "FastQC: raw reads": + percent_duplicates: "% Dups (raw)" + percent_gc: "% GC (raw)" + avg_sequence_length: "Avg. length (raw)" + median_sequence_length: "Median length (raw)" + total_sequences: "M Seqs (raw)" + percent_fails: "% Fails (raw)" + "FastQC: after preprocessing": + percent_duplicates: "% Dups (processed)" + percent_gc: "% GC (processed)" + avg_sequence_length: "Avg. 
length (processed)" + median_sequence_length: "Median length (processed)" + total_sequences: "M Seqs (processed)" + percent_fails: "% Fails (processed)" + "Bowtie2: PhiX removal": + overall_alignment_rate: "% Aligned (PhiX)" + "Bowtie2: host removal": + overall_alignment_rate: "% Aligned (Host)" + "Bowtie2: assembly": + overall_alignment_rate: "% Aligned (Assem.)" + +custom_table_header_config: + general_stats_table: + "Total length": + hidden: true + N50: + hidden: true diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index 5f653ab7..00000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/schema_assembly_input.json b/assets/schema_assembly_input.json new file mode 100644 index 00000000..404845b9 --- /dev/null +++ b/assets/schema_assembly_input.json @@ -0,0 +1,35 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/nf-core/mag/master/assets/schema_input.json", + "title": "nf-core/mag pipeline - params.input schema", + "description": "Schema for the file provided with params.input", + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string", + "pattern": "^\\S+$", + "meta": ["id"] + }, + "group": { + "type": ["string", "integer"], + "pattern": "^\\S+$", + "meta": ["group"] + }, + "assembler": { + "type": "string", + "pattern": "MEGAHIT|SPAdes|SPAdesHybrid", + "meta": ["assembler"] + }, + "fasta": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.(fasta|fas|fa|fna)(\\.gz)?$", + "exists": true + } + }, + "required": ["id", "group", "assembler", "fasta"] + }, + "allOf": [{ "uniqueEntries": ["fasta"] }] +} diff --git a/assets/schema_input.json b/assets/schema_input.json index 9485ea88..01b494b5 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -10,24 +10,43 @@ "sample": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces", "meta": ["id"] }, - "fastq_1": { + "run": { + "type": ["string", "integer"], + "pattern": "^\\S+$", + "meta": ["run"], + "unique": ["sample"] + }, + "group": { + "type": ["string", "integer"], + "pattern": "^\\S+$", + "meta": ["group"] + }, + "short_reads_1": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.f(ast)?q\\.gz$" + }, + "short_reads_2": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+\\.f(ast)?q\\.gz$" }, - "fastq_2": { + "long_reads": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+\\.f(ast)?q\\.gz$" } }, - "required": ["sample", "fastq_1"] + "required": ["sample", "group", "short_reads_1"] + }, + "uniqueEntries": ["sample", "run"], + "dependentRequired": { + "short_reads_2": ["short_reads_1"], + "long_reads": ["short_reads_1", "short_reads_2"] } } diff --git a/bin/combine_tables.py b/bin/combine_tables.py new file mode 100755 index 00000000..2b8d3767 --- /dev/null 
+++ b/bin/combine_tables.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python + +## Originally written by Daniel Straub and Sabrina Krakau and released under the MIT license. +## See git repository (https://github.com/nf-core/mag) for full license text. + +import argparse +import sys + +import pandas as pd + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--depths_summary", + required=True, + metavar="FILE", + help="Bin depths summary file.", + ) + parser.add_argument("-b", "--binqc_summary", metavar="FILE", help="BUSCO summary file.") + parser.add_argument("-q", "--quast_summary", metavar="FILE", help="QUAST BINS summary file.") + parser.add_argument("-g", "--gtdbtk_summary", metavar="FILE", help="GTDB-Tk summary file.") + parser.add_argument("-a", "--cat_summary", metavar="FILE", help="CAT table file.") + parser.add_argument( + "-t", "--binqc_tool", help="Bin QC tool used", choices=["busco", "checkm", "checkm2"] + ) + + parser.add_argument( + "-o", + "--out", + required=True, + metavar="FILE", + type=argparse.FileType("w"), + help="Output file containing final summary.", + ) + return parser.parse_args(args) + + +def parse_cat_table(cat_table): + """Parse CAT table. + + CAT table is trickier to parse than the other tables, because it has a variable number of columns, + depending on the number of ranks that are reported for the taxonomic assignation of each contig. + Therefore, we first parse the header to get the column names, and then parse the table, to get the + maximum number of columns. Then, we merge the columns containing the ranks into a single column. + + Args: + cat_table (str): Path to CAT table + + Returns: + pd.DataFrame: parse CAT table + """ + with open(cat_table, "r") as f: + next(f) # skip header + maxcol = 0 + for line in f: + maxcol = max(maxcol, len(line.split("\t"))) + + header = [ + "bin", + "classification", + "reason", + "lineage", + "lineage scores", + "full lineage names", + ] + + df = pd.read_table( + cat_table, + names=header + [f"rank_{i}" for i in range(maxcol - len(header))], + on_bad_lines="warn", + header=None, + skiprows=1, + ) + # merge all rank columns into a single column + df["CAT_rank"] = ( + df.filter(regex="rank_\d+").apply(lambda x: ";".join(x.dropna()), axis=1).str.lstrip() + ) + # remove rank_* columns + df.drop(df.filter(regex="rank_\d+").columns, axis=1, inplace=True) + + return df + + +def main(args=None): + args = parse_args(args) + + if ( + not args.binqc_summary + and not args.quast_summary + and not args.gtdbtk_summary + ): + sys.exit( + "No summary specified! " + "Please specify at least BUSCO, CheckM, CheckM2 or QUAST summary." + ) + + # GTDB-Tk can only be run in combination with BUSCO, CheckM or CheckM2 + if args.gtdbtk_summary and not args.binqc_summary: + sys.exit( + "Invalid parameter combination: " + "GTDB-TK summary specified, but no BUSCO, CheckM or CheckM2 summary!" 
+ ) + + # handle bin depths + results = pd.read_csv(args.depths_summary, sep="\t") + results.columns = ["Depth " + str(col) if col != "bin" else col for col in results.columns] + bins = results["bin"].sort_values().reset_index(drop=True) + + if args.binqc_summary and args.binqc_tool == "busco": + busco_results = pd.read_csv(args.binqc_summary, sep="\t") + if not bins.equals(busco_results["GenomeBin"].sort_values().reset_index(drop=True)): + sys.exit("Bins in BUSCO summary do not match bins in bin depths summary!") + results = pd.merge( + results, busco_results, left_on="bin", right_on="GenomeBin", how="outer" + ) # assuming depths for all bins are given + + if args.binqc_summary and args.binqc_tool == "checkm": + use_columns = [ + "Bin Id", + "Marker lineage", + "# genomes", + "# markers", + "# marker sets", + "Completeness", + "Contamination", + "Strain heterogeneity", + "Coding density", + "Translation table", + "# predicted genes", + "0", + "1", + "2", + "3", + "4", + "5+", + ] + checkm_results = pd.read_csv(args.binqc_summary, usecols=use_columns, sep="\t") + checkm_results["Bin Id"] = checkm_results["Bin Id"] + ".fa" + if not set(checkm_results["Bin Id"]).issubset(set(bins)): + sys.exit("Bins in CheckM summary do not match bins in bin depths summary!") + results = pd.merge( + results, checkm_results, left_on="bin", right_on="Bin Id", how="outer" + ) # assuming depths for all bins are given + results["Bin Id"] = results["Bin Id"].str.removesuffix(".fa") + + if args.binqc_summary and args.binqc_tool == "checkm2": + use_columns = [ + "Name", + "Completeness", + "Contamination", + "Completeness_Model_Used", + "Coding_Density", + "Translation_Table_Used", + "Total_Coding_Sequences", + ] + checkm2_results = pd.read_csv(args.binqc_summary, usecols=use_columns, sep="\t") + checkm2_results["Name"] = checkm2_results["Name"] + ".fa" + if not set(checkm2_results["Name"]).issubset(set(bins)): + sys.exit("Bins in CheckM2 summary do not match bins in bin depths summary!") + results = pd.merge( + results, checkm2_results, left_on="bin", right_on="Name", how="outer" + ) # assuming depths for all bins are given + results["Name"] = results["Name"].str.removesuffix(".fa") + + if args.quast_summary: + quast_results = pd.read_csv(args.quast_summary, sep="\t") + if not bins.equals(quast_results["Assembly"].sort_values().reset_index(drop=True)): + sys.exit("Bins in QUAST summary do not match bins in bin depths summary!") + results = pd.merge( + results, quast_results, left_on="bin", right_on="Assembly", how="outer" + ) # assuming depths for all bins are given + + if args.gtdbtk_summary: + gtdbtk_results = pd.read_csv(args.gtdbtk_summary, sep="\t") + if len(set(gtdbtk_results["user_genome"].to_list()).difference(set(bins))) > 0: + sys.exit("Bins in GTDB-Tk summary do not match bins in bin depths summary!") + results = pd.merge( + results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer" + ) # assuming depths for all bins are given + + if args.cat_summary: + cat_results = parse_cat_table(args.cat_summary) + if len(set(cat_results["bin"].to_list()).difference(set(bins))) > 0: + sys.exit("Bins in CAT summary do not match bins in bin depths summary!") + results = pd.merge( + results, + cat_results[["bin", "CAT_rank"]], + left_on="bin", + right_on="bin", + how="outer", + ) + + results.to_csv(args.out, sep="\t") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/domain_classification.R b/bin/domain_classification.R new file mode 100755 index 00000000..33530ca5 --- /dev/null +++ 
b/bin/domain_classification.R @@ -0,0 +1,156 @@ +#!/usr/bin/env Rscript + +## Written by Jim Downie and released under the MIT license. +## See git repository (https://github.com/nf-core/mag) for full license text. + +library(optparse) +library(tidyverse) + +parser <- OptionParser() +parser <- add_option(parser, c("-t", "--classification_file"), + action = "store", + type = "character", + metavar = "character", + help = "The out.txt tsv file of per-contig classifications from Tiara.") +parser <- add_option(parser, c("-s", "--contig_to_bin"), + action = "store", + type = "character", + metavar = "character", + help = "A tsv file with two columns, bin and contig, listing the contig membership for each bin.") +parser <- add_option(parser, c("-j", "--join_prokaryotes"), + action = "store_true", + type = "logical", + default = TRUE, + metavar = "logical", + help = "Use an general prokaryote classification instead of separating Archaea and Bacteria.") +parser <- add_option(parser, c("-a", "--assembler"), + action = "store", + type = "character", + metavar = "character", + help = "Assembler used to assemble the contigs. 'MEGAHIT' or 'SPAdes' only.") +parser <- add_option(parser, c("-o", "--output_prefix"), + action = "store", + type = "character", + metavar = "character", + help = "Prefix for the output classification table name.") +args <- parse_args(parser) + +## optparse doesn't have a required flag so exit if we don't get given a file +if(is.null(args$classification_file)) { + stop("Tiara classification file not provided.") +} +if(is.null(args$contig_to_bin)) { + stop("Contig to bin file not provided.") +} +if(is.null(args$assembler)) { + stop("Assembler not provided.") +} +if(!(args$assembler %in% c("MEGAHIT", "SPAdes"))) { + stop("Invalid assembler provided.") +} + +find_classification <- function(probabilities, join_prokaryotes = TRUE) { + if(join_prokaryotes) { + classifications <- c("prokarya", "eukarya", "organelle", "unknown") + } else { + classifications <- c("archaea", "bacteria", "eukarya", "organelle", "unknown") + } + return(classifications[which.max(probabilities)]) +} + +classify_bins <- function(tiara, contig2bin, join_prokaryotes, assembler){ + ## MEGAHIT produces contigs with spaces in the name + ## Depending on the binner, everything after the first space is sometimes dropped + ## Make sure that we drop everything after a possible space before doing anything else to allow merging + if(assembler == "MEGAHIT"){ + tiara$sequence_id <- word(tiara$sequence_id) + contig2bin$sequence_id <- word(contig2bin$sequence_id) + } + if(join_prokaryotes) { + n_classifications <- 4 + } else { + n_classifications <- 5 + } + + ## combination of left_join and filter collectively eliminate unclassified contigs + tiara <- tiara |> + left_join(contig2bin) |> + filter(!is.na(BinID)) |> + select(sequence_id, + BinID, + Archaea = arc, + Bacteria = bac, + Eukarya = euk, + Organelle = org, + Unknown = unk1) + + if(join_prokaryotes) { + tiara <- tiara |> + mutate(Prokarya = Archaea + Bacteria) |> + select(sequence_id, BinID, Prokarya, Eukarya, Organelle, Unknown) + } + + ## Identify the columns to softmax + prob_columns <- 2:(2 + n_classifications - 1) + + ## Calculate softmax probabilites based on summed bin probabilities for each category + softmax_probabilities <- tiara |> + group_by(BinID) |> + summarise(across(all_of(prob_columns), sum), .groups = "drop") |> + rowwise() |> + mutate(denominator = sum(exp(c_across(all_of(prob_columns))))) |> + mutate(across(all_of(prob_columns), \(x) 
exp(x)/denominator), + classification = find_classification(c_across(all_of(prob_columns)), + join_prokaryotes = join_prokaryotes)) |> + select(-denominator) + + ## A bin may have no classified contigs if all contigs are below the minimum + ## Tiara length threshold + all_bins <- unique(contig2bin$BinID) + unclassified_bins <- all_bins[!(all_bins %in% softmax_probabilities$BinID)] + + ## Assign these as unclassified + if(length(unclassified_bins) > 0) { + if(join_prokaryotes == TRUE){ + unclassified_bins_tbl <- tibble( + BinID = unclassified_bins, + Prokarya = NA, + Eukarya = NA, + Organelle = NA, + Unknown = NA, + classification = "unknown" + ) + } else { + unclassified_bins_tbl <- tibble( + BinID = unclassified_bins, + Bacteria = NA, + Archaea = NA, + Eukarya = NA, + Organelle = NA, + Unknown = NA, + classification = "unknown" + ) + } + softmax_probabilities <- bind_rows(softmax_probabilities, unclassified_bins_tbl) + } + + return(softmax_probabilities) +} + +classifications <- read_tsv(args$classification_file, na = c("NA", "n/a")) +contig_to_bin <- read_tsv(args$contig_to_bin, col_names = c("sequence_id", "BinID")) + +results <- classify_bins(tiara = classifications, + contig2bin = contig_to_bin, + join_prokaryotes = args$join_prokaryotes, + assembler = args$assembler) + +## Keep just the classifications so we can loop over more easily +results_basic <- select(results, BinID, classification) + +## write outputs +write_tsv(results, paste0(args$output_prefix, ".binclassification.tsv")) +write_tsv(results_basic, "bin2classification.tsv", col_names = FALSE) + +## write out package versions +packageVersion("tidyverse") |> as.character() |> writeLines("tidyverse_version.txt") diff --git a/bin/filter_ssu.py b/bin/filter_ssu.py new file mode 100755 index 00000000..5e4675e4 --- /dev/null +++ b/bin/filter_ssu.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +## Originally written by Hadrien Gourlé and released under the MIT license. +## See git repository (https://github.com/nf-core/mag) for full license text. + +from __future__ import print_function + +import os +import sys +import argparse + + +def filter(args): + """filter blast hits from refinem + + Args: + args (obj): arguments from argparse + """ + with open(args.ssu, "r") as i, open(args.output, "w") as o: + header = i.readline() + for line in i: + splitted_line = line.split() + evalue = splitted_line[7] + align_length = splitted_line[8] + percent_ident = splitted_line[9] + + if int(evalue) <= args.evalue: + o.write(line) + else: + continue + + +def main(): + parser = argparse.ArgumentParser( + prog="filter_ssu.py", usage="filter ssu hits from refinem" + ) + parser.add_argument("--evalue", help="evalue threshold") + parser.add_argument( + "ssu", metavar="ssu.tsv", help="ssu tsv file generated by refinem" + ) + parser.add_argument( + "output", metavar="output.tsv", default="output.tsv", help="output file name" + ) + parser.set_defaults(func=filter) + args = parser.parse_args() + + try: + args.func(args) + except AttributeError as e: + parser.print_help() + raise + + +if __name__ == "__main__": + main() diff --git a/bin/get_mag_depths.py b/bin/get_mag_depths.py new file mode 100755 index 00000000..43ce3539 --- /dev/null +++ b/bin/get_mag_depths.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python + +## Originally written by Sabrina Krakau and released under the MIT license. +## See git repository (https://github.com/nf-core/mag) for full license text. 
+ +import sys +import argparse +import os.path +import pandas as pd +import csv +import gzip +import statistics + +from Bio import SeqIO + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument( + "-b", + "--bins", + required=True, + nargs="+", + metavar="FILE", + help="Bins: FASTA containing all contigs.", + ) + parser.add_argument( + "-d", + "--depths", + required=True, + metavar="FILE", + help="(Compressed) TSV file containing contig depths for each sample: contigName, contigLen, totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...].", + ) + parser.add_argument( + "-a", "--assembler", required=True, type=str, help="Assembler name." + ) + parser.add_argument( + "-i", "--id", required=True, type=str, help="Sample or group id." + ) + parser.add_argument( + "-m", "--binner", required=True, type=str, help="Binning method." + ) + return parser.parse_args(args) + + +# Processing contig depths for each binner again, i.e. not the most efficient way, but ok + + +def main(args=None): + args = parse_args(args) + + # load contig depths for all samples into dict (could use pandas as well) + sample_names = [] + dict_contig_depths = {} + with gzip.open(args.depths, "rt") as infile: + reader = csv.reader(infile, delimiter="\t") + # process header + header = next(reader) + for sample in range(int((len(header) - 3) / 2)): + col_name = header[3 + 2 * sample] + # retrieve sample name: "--.bam" + sample_name = col_name[len(args.assembler) + 1 + len(args.id) + 1 : -4] + sample_names.append(sample_name) + # process contig depths + for row in reader: + contig_depths = [] + for sample in range(int((len(row) - 3) / 2)): + contig_depths.append(float(row[3 + 2 * sample])) + dict_contig_depths[str(row[0])] = contig_depths + + # Initialize output files + n_samples = len(sample_names) + with open( + args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "w" + ) as outfile: + print("bin", "\t".join(sample_names), sep="\t", file=outfile) + + # for each bin, access contig depths and compute mean bin depth (for all samples) + for file in args.bins: + all_depths = [[] for i in range(n_samples)] + + if file.endswith(".gz"): + with gzip.open(file, "rt") as infile: + for rec in SeqIO.parse(infile, "fasta"): + contig_depths = dict_contig_depths[rec.id] + for sample in range(n_samples): + all_depths[sample].append(contig_depths[sample]) + else: + with open(file, "rt") as infile: + for rec in SeqIO.parse(infile, "fasta"): + contig_depths = dict_contig_depths[rec.id] + for sample in range(n_samples): + all_depths[sample].append(contig_depths[sample]) + + binname = os.path.basename(file) + with open( + args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "a" + ) as outfile: + print( + binname, + "\t".join( + str(statistics.median(sample_depths)) + for sample_depths in all_depths + ), + sep="\t", + file=outfile, + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/get_mag_depths_summary.py b/bin/get_mag_depths_summary.py new file mode 100755 index 00000000..69433371 --- /dev/null +++ b/bin/get_mag_depths_summary.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +## Originally written by Sabrina Krakau and released under the MIT license. +## See git repository (https://github.com/nf-core/mag) for full license text. 
+ +import sys +import argparse +import pandas as pd + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--depths", + required=True, + nargs="+", + metavar="FILE", + help="TSV file for each assembly and binning method containing bin depths for samples: bin, sample1, ....", + ) + parser.add_argument( + "-o", + "--out", + required=True, + metavar="FILE", + type=argparse.FileType("w"), + help="Output file containing depths for all assemblies, binning methods and all samples.", + ) + return parser.parse_args(args) + + +def main(args=None): + args = parse_args(args) + + results = pd.DataFrame() + for assembly_depths_file in args.depths: + assembly_results = pd.read_csv(assembly_depths_file, index_col="bin", sep="\t") + results = results.append(assembly_results, sort=True, verify_integrity=True) + + results.to_csv(args.out, sep="\t") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/multiqc_to_custom_tsv.py b/bin/multiqc_to_custom_tsv.py new file mode 100755 index 00000000..4388fb26 --- /dev/null +++ b/bin/multiqc_to_custom_tsv.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python + +## Copied from nf-core/viralrecon and adjusted +## See git repository (https://github.com/nf-core/viralrecon) for full license text. + + +import os +import sys +import errno +import argparse +import yaml + + +def parse_args(args=None): + Description = "Create custom spreadsheet for pertinent MultiQC bowtie 2 metrics generated by the nf-core/mag pipeline." + Epilog = "Example usage: python multiqc_to_custom_tsv.py" + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "-md", + "--multiqc_data_dir", + type=str, + dest="MULTIQC_DATA_DIR", + default="multiqc_data", + help="Full path to directory containing YAML files for each module, as generated by MultiQC. (default: 'multiqc_data').", + ) + parser.add_argument( + "-se", + "--single_end", + dest="SINGLE_END", + action="store_true", + help="Specifies that the input is single-end reads.", + ) + return parser.parse_args(args) + + +def make_dir(path): + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + + +# Find key in dictionary created from YAML file recursively +# From https://stackoverflow.com/a/37626981 +def find_tag(d, tag): + if tag in d: + yield d[tag] + for k, v in d.items(): + if isinstance(v, dict): + for i in find_tag(v, tag): + yield i + + +def yaml_fields_to_dict(YAMLFile, AppendDict={}, FieldMappingList=[]): + with open(YAMLFile) as f: + yaml_dict = yaml.safe_load(f) + for k in yaml_dict.keys(): + key = k + if key not in AppendDict: + AppendDict[key] = {} + if FieldMappingList != []: + for i, j in FieldMappingList: + val = list(find_tag(yaml_dict[k], j[0])) + if len(val) != 0: + val = val[0] + if len(j) == 2: + val = list(find_tag(val, j[1]))[0] + if i not in AppendDict[key]: + AppendDict[key][i] = val + else: + print( + "WARNING: {} key already exists in dictionary so will be overwritten. 
YAML file {}.".format( + i, YAMLFile + ) + ) + else: + AppendDict[key] = yaml_dict[k] + return AppendDict + + +# customized +def metrics_dict_to_file(FileFieldList, MultiQCDataDir, OutFile, se): + MetricsDict = {} + FieldList = [] + for yamlFile, mappingList in FileFieldList: + yamlFile = os.path.join(MultiQCDataDir, yamlFile) + if os.path.exists(yamlFile): + MetricsDict = yaml_fields_to_dict( + YAMLFile=yamlFile, AppendDict=MetricsDict, FieldMappingList=mappingList + ) + FieldList += [x[0] for x in mappingList] + else: + print("WARNING: File does not exist: {}".format(yamlFile)) + + if MetricsDict != {}: + make_dir(os.path.dirname(OutFile)) + with open(OutFile, "w") as fout: + if se: + fout.write( + "{}\n".format( + "\t".join( + [ + "Sample", + "SE reads not mapped (kept)", + "SE reads mapped (discarded)", + ] + ) + ) + ) + else: + fout.write( + "{}\n".format( + "\t".join( + [ + "Sample", + "PE reads not mapped concordantly (kept)", + "PE reads mapped concordantly (discarded)", + ] + ) + ) + ) + for k in sorted(MetricsDict.keys()): + # write out # not mapped reads and # mapped reads (uniquely + multi mapping reads) + fout.write( + "{}\n".format( + "\t".join( + [ + k, + str(MetricsDict[k][FieldList[0]]), + str( + MetricsDict[k][FieldList[1]] + + MetricsDict[k][FieldList[2]] + ), + ] + ) + ) + ) + return MetricsDict + + +def main(args=None): + args = parse_args(args) + + ## File names for MultiQC YAML along with fields to fetch from each file + Bowtie2FileFieldList = [] + if args.SINGLE_END: + Bowtie2FileFieldList = [ + ( + "multiqc_bowtie2.yaml", + [ + ("# Not mapped reads", ["unpaired_aligned_none"]), + ("# Mapped reads 1", ["unpaired_aligned_one"]), + ("# Mapped reads multi", ["unpaired_aligned_multi"]), + ], + ), + ] + else: + Bowtie2FileFieldList = [ + ( + "multiqc_bowtie2.yaml", + [ + ("# Not mapped reads", ["paired_aligned_none"]), + ("# Mapped reads 1", ["paired_aligned_one"]), + ("# Mapped reads multi", ["paired_aligned_multi"]), + ], + ), + ] + + ## Write Bowtie 2 metrics to file + metrics_dict_to_file( + FileFieldList=Bowtie2FileFieldList, + MultiQCDataDir=args.MULTIQC_DATA_DIR, + OutFile="host_removal_metrics.tsv", + se=args.SINGLE_END, + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/plot_mag_depths.py b/bin/plot_mag_depths.py new file mode 100755 index 00000000..d3782845 --- /dev/null +++ b/bin/plot_mag_depths.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +# Originally written by Sabrina Krakau and released under the MIT license. +# See git repository (https://github.com/nf-core/mag) for full license text. + +import sys +import argparse +import os.path +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument( + "-d", + "--bin_depths", + required=True, + metavar="FILE", + help="Bin depths file in TSV format (for one assembly and binning method): bin, sample1_depth, sample2_depth, ....", + ) + parser.add_argument( + "-g", + "--groups", + required=True, + metavar="FILE", + help="File in TSV format containing group information for samples: sample, group", + ) + parser.add_argument( + "-o", "--out", required=True, metavar="FILE", type=str, help="Output file." 
+ ) + return parser.parse_args(args) + + +def main(args=None): + args = parse_args(args) + + # load data + df = pd.read_csv(args.bin_depths, sep="\t", index_col=0) + groups = pd.read_csv(args.groups, sep="\t", index_col=0, names=["sample", "group"]) + + # add pseudo-abundances (sample-wise? dependent on lib-size) + pseudo_cov = 0.1 * df[df > 0].min().min() + df.replace(0, pseudo_cov, inplace=True) + # compute centered log-ratios + # divide df by sample-wise geometric means + gmeans = stats.gmean(df, axis=0) # apply on axis=0: 'index' + df = np.log( + df.div(gmeans, axis="columns") + ) # divide column-wise (axis=1|'columns'), take natural logorithm + df.index.name = "MAGs" + df.columns.name = "Samples" + + # prepare colors for group information + color_map = dict( + zip( + groups["group"].unique(), + sns.color_palette(n_colors=len(groups["group"].unique())), + ) + ) + + # plot + plt.figure() + bin_labels = True + if len(df) > 30: + bin_labels = False + sns.clustermap( + df, + row_cluster=True, + yticklabels=bin_labels, + cmap="vlag", + center=0, + col_colors=groups.group.map(color_map), + figsize=(6, 6), + ) + plt.savefig(args.out) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/run_busco.sh b/bin/run_busco.sh new file mode 100755 index 00000000..b0864e22 --- /dev/null +++ b/bin/run_busco.sh @@ -0,0 +1,166 @@ +#! /usr/bin/env bash + +# Originally written by Sabrina Krakau and James Fellows Yates and released +# under the MIT license. +# See git repository (https://github.com/nf-core/mag) for full license text. + +p=$1 +cp_augustus_config=$2 +db=$3 +bin=$4 +task_cpus=$5 +lineage_dataset_provided=$6 +busco_clean=$7 +extra_args=$8 + +# ensure augustus has write access to config directory +if [ ${cp_augustus_config} = "Y" ]; then + cp -r /usr/local/config/ augustus_config/ + export AUGUSTUS_CONFIG_PATH=augustus_config +fi + +# place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it) +if [ ${lineage_dataset_provided} = "Y" ]; then + mkdir dataset + mv ${db} dataset/ +fi + +# set nullgob: if pattern matches no files, expand to a null string rather than to itself +shopt -s nullglob + +# only used for saving busco downloads +most_spec_db="NA" + +if + busco ${p} \ + --mode genome \ + --in ${bin} \ + --cpu ${task_cpus} \ + ${extra_args} \ + --out "BUSCO" >${bin}_busco.log 2>${bin}_busco.err +then + + # get name of used specific lineage dataset + summaries=(BUSCO/short_summary.specific.*.BUSCO.txt) + if [ ${#summaries[@]} -ne 1 ]; then + echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one." + exit 1 + fi + [[ $summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]] + db_name_spec="${BASH_REMATCH[1]}" + most_spec_db=${db_name_spec} + echo "Used specific lineage dataset: ${db_name_spec}" + + if [ ${lineage_dataset_provided} = "Y" ]; then + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt + + # if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting + # report bin as failed to allow consistent warnings within the pipeline for both settings + if egrep -q $'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log; then + echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log." 
+ echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt" + fi + else + # auto lineage selection + if { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && + egrep -q $'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log; } || + { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && + egrep -q $'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log && + egrep -q $'INFO:\tUsing local lineages directory ' ${bin}_busco.log; }; then + # the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately + # and cause different log messages + echo "Domain and specific lineage could be selected by BUSCO." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt + + db_name_gen="" + summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt) + if [ ${#summaries_gen[@]} -lt 1 ]; then + echo "No 'BUSCO/short_summary.generic.*.BUSCO.txt' file found. Assuming selected domain and specific lineages are the same." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt + db_name_gen=${db_name_spec} + else + [[ $summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]] + db_name_gen="${BASH_REMATCH[1]}" + echo "Used generic lineage dataset: ${db_name_gen}" + cp BUSCO/short_summary.generic.${db_name_gen}.BUSCO.txt short_summary.domain.${db_name_gen}.${bin}.txt + fi + + for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz + break + done + for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz + break + done + + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log; then + echo "Domain could be selected by BUSCO, but no more specific lineage." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt + + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log; then + echo "Domain could be selected by BUSCO, but no more specific lineage." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt + + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tRunning virus detection pipeline' ${bin}_busco.log; then + # TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*? + echo "Domain could not be selected by BUSCO, but virus dataset was selected." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt + else + echo "ERROR: Some not expected case occurred! See ${bin}_busco.log." 
>&2 + exit 1 + fi + fi + + for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_spec}.faa.gz + break + done + for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_spec}.fna.gz + break + done + +elif egrep -q $'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err; then + echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err." + echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt" + +elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'ERROR:\tPlacements failed' ${bin}_busco.err; then + echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset." + echo -e "${bin}\tPlacements failed" >"${bin}_busco.failed_bin.txt" + + message=$(egrep $'INFO:\t\\S+ selected' ${bin}_busco.log) + [[ $message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]] + db_name_gen="${BASH_REMATCH[1]}" + most_spec_db=${db_name_gen} + echo "Used generic lineage dataset: ${db_name_gen}" + cp BUSCO/auto_lineage/run_${db_name_gen}/short_summary.txt short_summary.domain.${db_name_gen}.${bin}.txt + + for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz + break + done + for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz + break + done + +else + echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2 + exit 1 +fi + +# additionally output genes predicted with Prodigal (GFF3) +if [ -f BUSCO/logs/prodigal_out.log ]; then + mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff" +fi + +# output value of most_spec_db +echo ${most_spec_db} >info_most_spec_db.txt + +# if needed delete temporary BUSCO files +if [ ${busco_clean} = "Y" ]; then + find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \; + find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \; + find . -depth -type d -name "run_*" -execdir rm -rf "{}" + +fi diff --git a/bin/split_fasta.py b/bin/split_fasta.py new file mode 100755 index 00000000..c5fb6e87 --- /dev/null +++ b/bin/split_fasta.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +## Originally written by Daniel Straub and Sabrina Krakau and released +## under the MIT license. +## See git repository (https://github.com/nf-core/mag) for full license text. 
+ +# USAGE: ./split_fasta.py <*.unbinned.fa(.gz)> + +import pandas as pd +import gzip +from sys import argv +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from Bio.Alphabet import generic_dna +import os +import re + +# Input +input_file = argv[1] +length_threshold = int(argv[2]) +max_sequences = int(argv[3]) +min_length_to_retain_contig = int(argv[4]) + +# Base name for file output +if input_file.endswith(".gz"): + rm_ext = input_file.replace(".gz", "") + out_base = out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", rm_ext) +else: + out_base = re.sub(r"\.fasta$|\.fa$|\.fna$", "", input_file) + +# Data structures to separate and store sequences +df_above_threshold = pd.DataFrame(columns=["id", "seq", "length"]) +pooled = [] +remaining = [] + +if input_file.endswith(".gz"): + with gzip.open(input_file, "rt") as f: + fasta_sequences = SeqIO.parse(f, "fasta") + + for fasta in fasta_sequences: + name, sequence = fasta.id, str(fasta.seq) + length = len(sequence) + + # store each sequence above threshold together with its length into df + if length >= length_threshold: + df_above_threshold = df_above_threshold.append( + {"id": name, "seq": sequence, "length": length}, ignore_index=True + ) + # contigs to retain and pool + elif length >= min_length_to_retain_contig: + pooled.append( + SeqRecord(Seq(sequence, generic_dna), id=name, description="") + ) + # remaining sequences + else: + remaining.append( + SeqRecord(Seq(sequence, generic_dna), id=name, description="") + ) +else: + with open(input_file) as f: + fasta_sequences = SeqIO.parse(f, "fasta") + + for fasta in fasta_sequences: + name, sequence = fasta.id, str(fasta.seq) + length = len(sequence) + + # store each sequence above threshold together with its length into df + if length >= length_threshold: + df_above_threshold = df_above_threshold.append( + {"id": name, "seq": sequence, "length": length}, ignore_index=True + ) + # contigs to retain and pool + elif length >= min_length_to_retain_contig: + pooled.append( + SeqRecord(Seq(sequence, generic_dna), id=name, description="") + ) + # remaining sequences + else: + remaining.append( + SeqRecord(Seq(sequence, generic_dna), id=name, description="") + ) + +# Sort sequences above threshold by length +df_above_threshold.sort_values(by=["length"], ascending=False, inplace=True) +df_above_threshold.reset_index(drop=True, inplace=True) + +# Write `max_sequences` longest sequences (above threshold) into separate files, add remainder to pooled +for index, row in df_above_threshold.iterrows(): + if index + 1 <= max_sequences: + print("write " + out_base + "." + str(index + 1) + ".fa") + out = SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="") + SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta") + else: + pooled.append( + SeqRecord(Seq(row["seq"], generic_dna), id=row["id"], description="") + ) + +print("write " + out_base + ".pooled.fa") +SeqIO.write(pooled, out_base + ".pooled.fa", "fasta") +print("write " + out_base + ".remaining.fa") +SeqIO.write(remaining, out_base + ".remaining.fa", "fasta") diff --git a/bin/summary_busco.py b/bin/summary_busco.py new file mode 100755 index 00000000..9701783b --- /dev/null +++ b/bin/summary_busco.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python + +## Originally written by Daniel Straub, Sabrina Krakau, and Hadrien Gourlé +## and released under the MIT license. +## See git repository (https://github.com/nf-core/mag) for full license text. 
+ +## USAGE: ./summary.busco.py -sd -ss -f + +import re +import sys +import argparse +import os.path +import pandas as pd + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument( + "-a", + "--auto", + default=False, + action="store_true", + help="BUSCO run in auto lineage selection mode.", + ) + parser.add_argument( + "-sd", + "--summaries_domain", + nargs="+", + metavar="FILE", + help="List of BUSCO summary files for domains.", + ) + parser.add_argument( + "-ss", + "--summaries_specific", + nargs="+", + metavar="FILE", + help="List of BUSCO summary files for specific lineages.", + ) + parser.add_argument( + "-f", + "--failed_bins", + nargs="+", + metavar="FILE", + help="List of files containing bin name for which BUSCO analysis failed.", + ) + parser.add_argument( + "-o", + "--out", + required=True, + metavar="FILE", + type=argparse.FileType("w"), + help="Output file containing final BUSCO summary.", + ) + return parser.parse_args(args) + + +def main(args=None): + args = parse_args(args) + + if ( + not args.summaries_domain + and not args.summaries_specific + and not args.failed_bins + ): + sys.exit( + "Either --summaries_domain, --summaries_specific or --failed_bins must be specified!" + ) + + # "# Summarized benchmarking in BUSCO notation for file /path/to/MEGAHIT-testset1.contigs.fa" + # " C:0.0%[S:0.0%,D:0.0%],F:0.0%,M:100.0%,n:148" + + regexes = [ + r"# Summarized benchmarking in BUSCO notation for file (\S+)", + r"# The lineage dataset is: (\S+) \(", + r" C:(\S+)%\[S:", + r"%\[S:(\S+)%,D:", + r"%,D:(\S+)%\],F:", + r"%\],F:(\S+)%,M:", + r"%,M:(\S+)%,n:", + r"%,n:(\S+)", + ] + columns_domain = [ + "GenomeBin", + "Domain", + "%Complete (domain)", + "%Complete and single-copy (domain)", + "%Complete and duplicated (domain)", + "%Fragmented (domain)", + "%Missing (domain)", + "Total number (domain)", + ] + columns_specific = [ + "GenomeBin", + "Specific lineage dataset", + "%Complete (specific)", + "%Complete and single-copy (specific)", + "%Complete and duplicated (specific)", + "%Fragmented (specific)", + "%Missing (specific)", + "Total number (specific)", + ] + + if args.auto: + columns = [ + "GenomeBin", + "Domain", + "%Complete (domain)", + "%Complete and single-copy (domain)", + "%Complete and duplicated (domain)", + "%Fragmented (domain)", + "%Missing (domain)", + "Total number (domain)", + "Specific lineage dataset", + "%Complete (specific)", + "%Complete and single-copy (specific)", + "%Complete and duplicated (specific)", + "%Fragmented (specific)", + "%Missing (specific)", + "Total number (specific)", + ] + else: + columns = [ + "GenomeBin", + "Specific lineage dataset", + "%Complete (specific)", + "%Complete and single-copy (specific)", + "%Complete and duplicated (specific)", + "%Fragmented (specific)", + "%Missing (specific)", + "Total number (specific)", + ] + + # Search each summary file using its regex + results_domain = [] + if args.summaries_domain: + for file in args.summaries_domain: + with open(file) as infile: + results = [] + text = infile.read() + for index, regex in enumerate(regexes): + match = re.search(regex, text) + if match: + if index == 0: + results.append(os.path.basename(match.group(1))) + else: + results.append(match.group(1)) + results_domain.append(results) + df_domain = pd.DataFrame(results_domain, columns=columns_domain) + + results_specific = [] + if args.summaries_specific: + for file in args.summaries_specific: + with open(file) as infile: + results = [] + text = infile.read() + for index, regex in 
enumerate(regexes): + match = re.search(regex, text) + if match: + if index == 0: + results.append(os.path.basename(match.group(1))) + else: + results.append(match.group(1)) + results_specific.append(results) + df_specific = pd.DataFrame(results_specific, columns=columns_specific) + + # Add entries for bins with failed analysis (for domain and specific lineage where applicable) + failed = [] + if args.failed_bins: + for file in args.failed_bins: + with open(file) as infile: + line = infile.readline() + # in case of failed placements domain summary was used and specific part will be filled with NAs when merging + if re.split(r"[\t\n]", line)[1] != "Placements failed": + failed_bin = re.split(r"[\t\n]", line)[0] + if args.auto: + results = [ + failed_bin, + pd.NA, + "0.0", + "0.0", + "0.0", + "0.0", + "100.0", + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] + else: + results = [ + failed_bin, + pd.NA, + "0.0", + "0.0", + "0.0", + "0.0", + "100.0", + pd.NA, + ] + failed.append(results) + df_failed = pd.DataFrame(failed, columns=columns) + + # merge results + if args.auto: + df_final = df_domain.merge(df_specific, on="GenomeBin", how="outer").append( + df_failed + ) + # check if 'Domain' is 'NA', but 'Specific lineage dataset' given -> 'Viruses' + df_final.loc[ + pd.isna(df_final["Domain"]) + & pd.notna(df_final["Specific lineage dataset"]), + "Domain", + ] = "Viruses" + + else: + df_final = df_specific.append(df_failed) + + df_final.to_csv(args.out, sep="\t", index=False) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/summary_gtdbtk.py b/bin/summary_gtdbtk.py new file mode 100755 index 00000000..370ea4fa --- /dev/null +++ b/bin/summary_gtdbtk.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python + +# Originally written by Sabrina Krakau and released under the MIT license. +# See git repository (https://github.com/nf-core/mag) for full license text. + +import re +import sys +import argparse +import os.path +import pandas as pd + + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument( + "-x", + "--extension", + required=True, + type=str, + help="File extension passed to GTDB-TK and substracted by GTDB-Tk from bin names in results files.", + ) + parser.add_argument( + "-s", + "--summaries", + nargs="+", + metavar="FILE", + help="List of GTDB-tk summary files.", + ) + parser.add_argument( + "-fi", + "--filtered_bins", + nargs="+", + metavar="FILE", + help="List of files containing names of bins which where filtered out during GTDB-tk analysis.", + ) + parser.add_argument( + "-fa", + "--failed_bins", + nargs="+", + metavar="FILE", + help="List of files containing bin names for which GTDB-tk analysis failed.", + ) + parser.add_argument( + "-d", + "--qc_discarded_bins", + nargs="+", + metavar="FILE", + type=str, + help="List of files containing names of bins which were discarded based on BUSCO metrics.", + ) + + parser.add_argument( + "-o", + "--out", + required=True, + metavar="FILE", + type=argparse.FileType("w"), + help="Output file containing final GTDB-tk summary.", + ) + return parser.parse_args(args) + + +def main(args=None): + args = parse_args(args) + + if ( + not args.summaries + and not args.filtered_bins + and not args.failed_bins + and not args.qc_discarded_bins + ): + sys.exit( + "Either --summaries, --filtered_bins, --failed_bins or --qc_discarded_bins must be specified!" 
+ ) + + columns = [ + "user_genome", + "classification", + "closest_genome_reference", + "closest_genome_reference_radius", + "closest_genome_taxonomy", + "closest_genome_ani", + "closest_genome_af", + "closest_placement_reference", + "closest_placement_radius", + "closest_placement_taxonomy", + "closest_placement_ani", + "closest_placement_af", + "pplacer_taxonomy", + "classification_method", + "note", + "other_related_references(genome_id,species_name,radius,ANI,AF)", + "msa_percent", + "translation_table", + "red_value", + "warnings", + ] + # Note: currently all columns included + + # For bins already discarded based on BUSCO QC metrics + discarded = [] + if args.qc_discarded_bins: + for bin_name in args.qc_discarded_bins: + bin_results = [ + bin_name, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] + discarded.append(bin_results) + + df_final = pd.DataFrame(discarded, columns=columns) + df_final.set_index("user_genome", inplace=True) + + # For bins with succesfull GTDB-tk classification + if args.summaries: + for file in args.summaries: + df_summary = pd.read_csv(file, sep="\t")[columns] + # add by GTDB-Tk substracted file extension again to bin names (at least until changed consistently in rest of pipeline) + df_summary["user_genome"] = ( + df_summary["user_genome"].astype(str) + "." + args.extension + ) + df_summary.set_index("user_genome", inplace=True) + df_final = df_final.append(df_summary, verify_integrity=True) + + # For bins that were filtered out by GTDB-tk (e.g. due to insufficient number of AAs in MSA) + filtered = [] + if args.filtered_bins: + for file in args.filtered_bins: + df = pd.read_csv(file, sep="\t", names=["bin_name", "reason"]) + for index, row in df.iterrows(): + bin_name = row["bin_name"] + bin_results = [ + bin_name, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] + filtered.append(bin_results) + + df_filtered = pd.DataFrame(filtered, columns=columns) + df_filtered["user_genome"] = ( + df_filtered["user_genome"].astype(str) + "." + args.extension + ) + df_filtered.set_index("user_genome", inplace=True) + df_final = df_final.append(df_filtered, verify_integrity=True) + + # For bins for which GTDB-tk classification failed + failed = [] + if args.failed_bins: + for file in args.failed_bins: + df = pd.read_csv(file, sep="\t", names=["bin_name", "reason"]) + for index, row in df.iterrows(): + bin_name = row["bin_name"] + bin_results = [ + bin_name, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + pd.NA, + ] + failed.append(bin_results) + + df_failed = pd.DataFrame(failed, columns=columns) + df_failed["user_genome"] = ( + df_failed["user_genome"].astype(str) + "." 
+ args.extension + ) + df_failed.set_index("user_genome", inplace=True) + df_final = df_final.append(df_failed, verify_integrity=True) + + # write output + df_final.reset_index().rename(columns={"index": "user_genome"}).to_csv( + args.out, sep="\t", index=False + ) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/base.config b/conf/base.config index 023ce613..3bb06c8f 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,13 +10,12 @@ process { - // TODO nf-core: Check the defaults for all processes - cpus = { 1 * task.attempt } - memory = { 6.GB * task.attempt } - time = { 4.h * task.attempt } + cpus = { 1 * task.attempt } + memory = { 7.GB * task.attempt } + time = { 4.h * task.attempt } errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 1 + maxRetries = 3 maxErrors = '-1' // Process-specific resource requirements @@ -24,39 +23,151 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors - withLabel:process_single { - cpus = { 1 } + withLabel: process_single { + cpus = { 1 } memory = { 6.GB * task.attempt } - time = { 4.h * task.attempt } + time = { 4.h * task.attempt } } - withLabel:process_low { - cpus = { 2 * task.attempt } + withLabel: process_low { + cpus = { 2 * task.attempt } memory = { 12.GB * task.attempt } - time = { 4.h * task.attempt } + time = { 4.h * task.attempt } } - withLabel:process_medium { - cpus = { 6 * task.attempt } + withLabel: process_medium { + cpus = { 6 * task.attempt } memory = { 36.GB * task.attempt } - time = { 8.h * task.attempt } + time = { 8.h * task.attempt } } - withLabel:process_high { - cpus = { 12 * task.attempt } + withLabel: process_high { + cpus = { 12 * task.attempt } memory = { 72.GB * task.attempt } - time = { 16.h * task.attempt } + time = { 16.h * task.attempt } } - withLabel:process_long { - time = { 20.h * task.attempt } + withLabel: process_long { + time = { 20.h * task.attempt } } - withLabel:process_high_memory { + withLabel: process_high_memory { memory = { 200.GB * task.attempt } } - withLabel:error_ignore { + withLabel: error_ignore { errorStrategy = 'ignore' } - withLabel:error_retry { + withLabel: error_retry { errorStrategy = 'retry' maxRetries = 2 } + + withName: BOWTIE2_HOST_REMOVAL_BUILD { + cpus = { 10 * task.attempt } + memory = { 20.GB * task.attempt } + time = { 4.h * task.attempt } + } + withName: BOWTIE2_HOST_REMOVAL_ALIGN { + cpus = { 10 * task.attempt } + memory = { 10.GB * task.attempt } + time = { 6.h * task.attempt } + } + withName: BOWTIE2_PHIX_REMOVAL_ALIGN { + cpus = { 4 * task.attempt } + memory = { 8.GB * task.attempt } + time = { 6.h * task.attempt } + } + withName: PORECHOP_PORECHOP { + cpus = { 4 * task.attempt } + memory = { 30.GB * task.attempt } + time = { 4.h * task.attempt } + } + withName: NANOLYSE { + cpus = { 2 * task.attempt } + memory = { 10.GB * task.attempt } + time = { 3.h * task.attempt } + } + //filtlong: exponential increase of memory and time with attempts + withName: FILTLONG { + cpus = { 8 * task.attempt } + memory = { 64.GB * (2 ** (task.attempt - 1)) } + time = { 24.h * (2 ** (task.attempt - 1)) } + } + withName: CENTRIFUGE_CENTRIFUGE { + cpus = { 8 * task.attempt } + memory = { 40.GB * task.attempt } + time = { 12.h 
* task.attempt } + } + withName: KRAKEN2 { + cpus = { 8 * task.attempt } + memory = { 40.GB * task.attempt } + time = { 12.h * task.attempt } + } + withName: KRONA_KTIMPORTTAXONOMY { + cpus = { 8 * task.attempt } + memory = { 20.GB * task.attempt } + time = { 12.h * task.attempt } + } + withName: CAT_DB_GENERATE { + memory = { 200.GB * task.attempt } + time = { 16.h * task.attempt } + } + withName: CAT { + cpus = { 8 * task.attempt } + memory = { 40.GB * task.attempt } + time = { 12.h * task.attempt } + } + withName: GTDBTK_CLASSIFYWF { + cpus = { 10 * task.attempt } + memory = { 128.GB * task.attempt } + time = { 12.h * task.attempt } + } + //MEGAHIT returns exit code 250 when running out of memory + withName: MEGAHIT { + cpus = { params.megahit_fix_cpu_1 ? 1 : (8 * task.attempt) } + memory = { 40.GB * task.attempt } + time = { 16.h * task.attempt } + errorStrategy = { task.exitStatus in ((130..145) + 104 + 250) ? 'retry' : 'finish' } + } + //SPAdes returns error(1) if it runs out of memory (and for other reasons as well...)! + //exponential increase of memory and time with attempts, keep number of threads to enable reproducibility + withName: SPADES { + cpus = { params.spades_fix_cpus != -1 ? params.spades_fix_cpus : (10 * task.attempt) } + memory = { 64.GB * (2 ** (task.attempt - 1)) } + time = { 24.h * (2 ** (task.attempt - 1)) } + errorStrategy = { task.exitStatus in ((130..145) + 104 + 21 + 12 + 1) ? 'retry' : 'finish' } + maxRetries = 5 + } + withName: SPADESHYBRID { + cpus = { params.spadeshybrid_fix_cpus != -1 ? params.spadeshybrid_fix_cpus : (10 * task.attempt) } + memory = { 64.GB * (2 ** (task.attempt - 1)) } + time = { 24.h * (2 ** (task.attempt - 1)) } + errorStrategy = { task.exitStatus in ((130..145) + 104 + 21 + 12 + 1) ? 'retry' : 'finish' } + maxRetries = 5 + } + //returns exit code 247 when running out of memory + withName: BOWTIE2_ASSEMBLY_ALIGN { + cpus = { 2 * task.attempt } + memory = { 8.GB * task.attempt } + time = { 8.h * task.attempt } + errorStrategy = { task.exitStatus in ((130..145) + 104 + 247) ? 'retry' : 'finish' } + } + withName: METABAT2_METABAT2 { + cpus = { 8 * task.attempt } + memory = { 20.GB * task.attempt } + time = { 8.h * task.attempt } + } + withName: MAG_DEPTHS { + memory = { 16.GB * task.attempt } + } + withName: BUSCO { + cpus = { 8 * task.attempt } + memory = { 20.GB * task.attempt } + } + withName: MAXBIN2 { + errorStrategy = { task.exitStatus in [1, 255] ? 'ignore' : 'retry' } + } + withName: DASTOOL_DASTOOL { + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : task.exitStatus == 1 ? 'ignore' : 'finish' } + } + //CheckM2 returns exit code 1 when Diamond doesn't find any hits + withName: CHECKM2_PREDICT { + errorStrategy = { task.exitStatus in (130..145) ? 'retry' : task.exitStatus == 1 ? 'ignore' : 'finish' } + } } diff --git a/conf/modules.config b/conf/modules.config index d203d2b6..701598db 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,17 +12,709 @@ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + //default: do not publish into the results folder + publishDir = [path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, enabled: false] - withName: FASTQC { - ext.args = '--quiet' + withName: FASTQC_RAW { + ext.args = '--quiet' + publishDir = [path: { "${params.outdir}/QC_shortreads/fastqc" }, mode: params.publish_dir_mode, pattern: "*.html"] + ext.prefix = { "${meta.id}_run${meta.run}_raw" } + tag = { "${meta.id}_run${meta.run}_raw" } } - withName: 'MULTIQC' { + withName: FASTP { + ext.args = [ + "-q ${params.fastp_qualified_quality}", + "--cut_front", + "--cut_tail", + "--cut_mean_quality ${params.fastp_cut_mean_quality}", + "--length_required ${params.reads_minlength}" + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/QC_shortreads/fastp/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{html,json}" + ], + [ + path: { "${params.outdir}/QC_shortreads/fastp/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz", + enabled: params.save_clipped_reads + ] + ] + ext.prefix = { "${meta.id}_run${meta.run}_fastp" } + tag = { "${meta.id}_run${meta.run}" } + } + + withName: ADAPTERREMOVAL_PE { + ext.args = [ + "--minlength ${params.reads_minlength}", + "--adapter1 ${params.adapterremoval_adapter1} --adapter2 ${params.adapterremoval_adapter2}", + "--minquality ${params.adapterremoval_minquality} --trimns", + params.adapterremoval_trim_quality_stretch ? "--trim_qualities" : "--trimwindows 4" + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/QC_shortreads/adapterremoval/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{settings}" + ], + [ + path: { "${params.outdir}/QC_shortreads/adapterremoval/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{truncated,discarded}.gz", + enabled: params.save_clipped_reads + ] + ] + ext.prefix = { "${meta.id}_run${meta.run}_ar2" } + tag = { "${meta.id}_run${meta.run}" } + } + + withName: ADAPTERREMOVAL_SE { + ext.args = [ + "--minlength ${params.reads_minlength}", + "--adapter1 ${params.adapterremoval_adapter1}", + "--minquality ${params.adapterremoval_minquality} --trimns", + params.adapterremoval_trim_quality_stretch ? "--trim_qualities" : "--trimwindows 4" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/QC_shortreads/adapterremoval/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{settings}" + ] + ext.prefix = { "${meta.id}_run${meta.run}_ar2" } + tag = { "${meta.id}_run${meta.run}" } + } + + withName: BOWTIE2_PHIX_REMOVAL_ALIGN { + ext.prefix = { "${meta.id}_run${meta.run}_phix_removed" } + publishDir = [ + [ + path: { "${params.outdir}/QC_shortreads/remove_phix" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/QC_shortreads/remove_phix" }, + mode: params.publish_dir_mode, + pattern: "*.unmapped*.fastq.gz", + enabled: params.save_phixremoved_reads + ] + ] + tag = { "${meta.id}_run${meta.run}" } + } + + withName: BOWTIE2_HOST_REMOVAL_ALIGN { + ext.args = params.host_removal_verysensitive ? "--very-sensitive" : "--sensitive" + ext.args2 = params.host_removal_save_ids ? 
"--host_removal_save_ids" : '' + ext.prefix = { "${meta.id}_run${meta.run}_host_removed" } + publishDir = [ + [ + path: { "${params.outdir}/QC_shortreads/remove_host" }, + mode: params.publish_dir_mode, + pattern: "*{.log,read_ids.txt}" + ], + [ + path: { "${params.outdir}/QC_shortreads/remove_host" }, + mode: params.publish_dir_mode, + pattern: "*.unmapped*.fastq.gz", + enabled: params.save_hostremoved_reads + ] + ] + tag = { "${meta.id}_run${meta.run}" } + } + + withName: FASTQC_TRIMMED { + ext.args = '--quiet' + ext.prefix = { "${meta.id}_run${meta.run}_trimmed" } + publishDir = [ + path: { "${params.outdir}/QC_shortreads/fastqc" }, + mode: params.publish_dir_mode, + pattern: "*.html" + ] + tag = { "${meta.id}_run${meta.run}" } + } + + withName: BBMAP_BBNORM { + ext.args = [ + params.bbnorm_target ? "target=${params.bbnorm_target}" : '', + params.bbnorm_min ? "min=${params.bbnorm_min}" : '' + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/bbmap/bbnorm/logs" }, + enabled: params.save_bbnorm_reads, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/bbmap/bbnorm/" }, + mode: 'copy', + enabled: params.save_bbnorm_reads, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz" + ] + ] + } + + withName: PORECHOP_PORECHOP { + publishDir = [ + path: { "${params.outdir}/QC_longreads/porechop" }, + mode: params.publish_dir_mode, + pattern: "*_porechop_trimmed.fastq.gz", + enabled: params.save_porechop_reads + ] + ext.prefix = { "${meta.id}_run${meta.run}_porechop_trimmed" } + } + + withName: PORECHOP_ABI { + publishDir = [ + path: { "${params.outdir}/QC_longreads/porechop" }, + mode: params.publish_dir_mode, + pattern: "*_porechop-abi_trimmed.fastq.gz", + enabled: params.save_porechop_reads + ] + ext.prefix = { "${meta.id}_run${meta.run}_porechop-abi_trimmed" } + } + + withName: FILTLONG { + ext.args = [ + "--min_length ${params.longreads_min_length}", + "--keep_percent ${params.longreads_keep_percent}", + "--trim", + "--length_weight ${params.longreads_length_weight}", + params.longreads_min_quality ? "--min_mean_q ${params.longreads_min_quality}" : '' + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/QC_longreads/Filtlong" }, + mode: params.publish_dir_mode, + pattern: "*_filtlong.fastq.gz", + enabled: params.save_filtered_longreads + ] + ext.prefix = { "${meta.id}_run${meta.run}_filtlong" } + } + + withName: NANOQ { + ext.args = [ + "--min-len ${params.longreads_min_length}", + params.longreads_min_quality ? "--min-qual ${params.longreads_min_quality}" : '', + "-vv" + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/QC_longreads/Nanoq" }, + mode: params.publish_dir_mode, + pattern: "*_nanoq_filtered.fastq.gz", + enabled: params.save_filtered_longreads + ], + [ + path: { "${params.outdir}/QC_longreads/Nanoq" }, + mode: params.publish_dir_mode, + pattern: "*_nanoq_filtered.stats" + ] + ] + ext.prefix = { "${meta.id}_run${meta.run}_nanoq_filtered" } + } + + withName: NANOLYSE { + publishDir = [ + [ + path: { "${params.outdir}/QC_longreads/NanoLyse" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/QC_longreads/NanoLyse" }, + mode: params.publish_dir_mode, + pattern: "*_nanolyse.fastq.gz", + enabled: params.save_lambdaremoved_reads + ] + ] + ext.prefix = { "${meta.id}_run${meta.run}_lambdafiltered" } + } + + withName: CHOPPER { + ext.args2 = [ + params.longreads_min_quality ? "--quality ${params.longreads_min_quality}" : '', + params.longreads_min_length ? 
"--minlength ${params.longreads_min_length}" : '' + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/QC_longreads/Chopper" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/QC_longreads/Chopper" }, + mode: params.publish_dir_mode, + pattern: "*_chopper.fastq.gz", + enabled: params.save_lambdaremoved_reads || params.save_filtered_longreads + ] + ] + ext.prefix = { "${meta.id}_run${meta.run}_chopper" } + } + + withName: NANOPLOT_RAW { + ext.prefix = 'raw' + ext.args = { + [ + "-p raw_", + "--title ${meta.id}_raw", + "-c darkblue" + ].join(' ').trim() + } + publishDir = [ + path: { "${params.outdir}/QC_longreads/NanoPlot/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{png,html,txt}" + ] + } + + withName: NANOPLOT_FILTERED { + ext.args = { + [ + "-p filtered_", + "--title ${meta.id}_filtered", + "-c darkblue" + ].join(' ').trim() + } + publishDir = [ + path: { "${params.outdir}/QC_longreads/NanoPlot/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{png,html,txt}" + ] + } + + withName: CENTRIFUGE_CENTRIFUGE { + publishDir = [path: { "${params.outdir}/Taxonomy/centrifuge/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*.txt"] + } + + withName: CENTRIFUGE_KREPORT { + ext.prefix = { "${meta.id}_kreport" } + publishDir = [path: { "${params.outdir}/Taxonomy/centrifuge/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*.txt"] + } + + withName: KRAKEN2 { + ext.args = '--quiet' + publishDir = [ + path: { "${params.outdir}/Taxonomy/kraken2/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.txt" + ] + } + + withName: KREPORT2KRONA_CENTRIFUGE { + publishDir = [path: { "${params.outdir}/Taxonomy/${meta.classifier}/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*.txt", enabled: false] + } + + withName: KRONA_KTIMPORTTAXONOMY { + publishDir = [path: { "${params.outdir}/Taxonomy/${meta.classifier}/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*.html"] + } + + withName: MEGAHIT { + ext.args = { params.megahit_options ? params.megahit_options + " -m ${task.memory.toBytes()}" : "-m ${task.memory.toBytes()}" } + ext.prefix = { "MEGAHIT-${meta.id}" } + publishDir = [path: { "${params.outdir}/Assembly/MEGAHIT" }, mode: params.publish_dir_mode, pattern: "*.{fa.gz,log}"] + } + + withName: METASPADES { + ext.args = params.spades_options ? params.spades_options + ' --meta' : '--meta' + ext.prefix = { "SPAdes-${meta.id}" } + publishDir = [path: { "${params.outdir}/Assembly/SPAdes" }, mode: params.publish_dir_mode, pattern: "*.{fasta.gz,gfa.gz,fa.gz,log}"] + } + + withName: METASPADESHYBRID { + ext.args = params.spades_options ? params.spades_options + ' --meta' : '--meta' + ext.prefix = { "SPAdesHybrid-${meta.id}" } + publishDir = [path: { "${params.outdir}/Assembly/SPAdesHybrid" }, mode: params.publish_dir_mode, pattern: "*.{fasta.gz,gfa.gz,fa.gz,log}"] + } + + withName: QUAST { + publishDir = [path: { "${params.outdir}/Assembly/${meta.assembler}/QC/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + } + + withName: GENOMAD_ENDTOEND { + ext.args = [ + "--cleanup", + "--min-score ${params.genomad_min_score}", + "--splits ${params.genomad_splits}" + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/VirusIdentification/geNomad/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: BOWTIE2_ASSEMBLY_ALIGN { + ext.args = params.bowtie2_mode ? params.bowtie2_mode : params.ancient_dna ? '--very-sensitive -N 1' : '' + ext.prefix = { "${meta.id}.assembly" } + publishDir = [ + [ + path: { "${params.outdir}/Assembly/${assembly_meta.assembler}/QC/${assembly_meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path: { "${params.outdir}/Assembly/${assembly_meta.assembler}/QC/${assembly_meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{bam,bai}", + enabled: params.save_assembly_mapped_reads + ] + ] + } + + withName: 'MAG_DEPTHS_PLOT|MAG_DEPTHS_SUMMARY' { + publishDir = [path: { "${params.outdir}/GenomeBinning/depths/bins" }, mode: params.publish_dir_mode, pattern: "*.{png,tsv}"] + } + + withName: BIN_SUMMARY { + publishDir = [ + path: { "${params.outdir}/GenomeBinning" }, + mode: params.publish_dir_mode, + pattern: "*.{png,tsv}" + ] + } + + withName: BUSCO_DB_PREPARATION { + publishDir = [path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, mode: params.publish_dir_mode, pattern: "*.tar.gz"] + } + + withName: BUSCO { + ext.args = [ + params.busco_db ? '--offline' : '' + ].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, + mode: params.publish_dir_mode, + pattern: "*.{log,err,faa.gz,fna.gz,gff,txt}" + ] + } + + withName: BUSCO_SAVE_DOWNLOAD { + publishDir = [path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, mode: params.publish_dir_mode, overwrite: false, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + } + + withName: 'BUSCO_SUMMARY|QUAST_BINS|QUAST_BINS_SUMMARY' { + publishDir = [path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + } + + withName: ARIA2_UNTAR { + publishDir = [path: { "${params.outdir}/GenomeBinning/QC/CheckM/checkm_downloads" }, mode: params.publish_dir_mode, overwrite: false, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: params.save_checkm_data] + } + + withName: CHECKM_LINEAGEWF { + tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" } + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CHECKM_QA { + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_qa" } + ext.args = "-o 2 --tab_table" + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: COMBINE_BINQC_TSV { + ext.prefix = { "${params.binqc_tool}_summary" } + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CHECKM2_DATABASEDOWNLOAD { + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM2/checkm2_downloads" }, + mode: params.publish_dir_mode, + overwrite: false, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enabled: params.save_checkm2_data + ] + } + + withName: CHECKM2_PREDICT { + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/CheckM2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: GUNC_DOWNLOADDB { + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/GUNC" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: params.gunc_save_db + ] + } + + // Make sure to keep directory in sync with gunc_qc.nf + withName: GUNC_RUN { + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/GUNC/raw/${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}/${fasta.baseName}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + // Make sure to keep directory in sync with gunc_qc.nf + withName: GUNC_MERGECHECKM { + publishDir = [ + path: { "${params.outdir}/GenomeBinning/QC/GUNC/checkmmerged/${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}/${checkm_file.baseName}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: CAT_DB_GENERATE { + publishDir = [path: { "${params.outdir}/Taxonomy/CAT" }, mode: params.publish_dir_mode, pattern: "*.tar.gz"] + } + + withName: CAT { + publishDir = [path: { "${params.outdir}/Taxonomy/CAT/${meta.assembler}/${meta.binner}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + } + withName: CAT_SUMMARY { + ext.prefix = "cat_summary" + publishDir = [path: { "${params.outdir}/Taxonomy/CAT/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + } + + withName: GTDBTK_CLASSIFYWF { + ext.args = [ + "--extension fa", + "--min_perc_aa ${params.gtdbtk_min_perc_aa}", + "--min_af ${params.gtdbtk_min_af}", + "--pplacer_cpus ${params.gtdbtk_pplacer_cpus}" + ].join(' ') + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } + publishDir = [ + path: { "${params.outdir}/Taxonomy/GTDB-Tk/${meta.assembler}/${meta.binner}/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*.{log,tsv,tree.gz,fasta,fasta.gz}" + ] + } + + withName: GTDBTK_SUMMARY { + ext.args = "--extension fa" + publishDir = [path: { "${params.outdir}/Taxonomy/GTDB-Tk" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + } + + withName: PROKKA { + ext.args = { params.prokka_with_compliance ? "--metagenome --compliant --centre ${params.prokka_compliance_centre}" : "--metagenome" } + publishDir = [path: { "${params.outdir}/Annotation/Prokka/${meta.assembler}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + } + + withName: PRODIGAL { + ext.args = "-p meta" + ext.prefix = { "${meta.assembler}-${meta.id}_prodigal" } + publishDir = [path: { "${params.outdir}/Annotation/Prodigal/${meta.assembler}/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }] + } + + withName: FREEBAYES { + ext.prefix = { "${meta.assembler}-${meta.id}" } + ext.args = "-p ${params.freebayes_ploidy} -q ${params.freebayes_min_basequality} -F ${params.freebayes_minallelefreq}" + publishDir = [path: { "${params.outdir}/Ancient_DNA/variant_calling/freebayes" }, mode: params.publish_dir_mode, pattern: "*.vcf.gz"] + } + + withName: BCFTOOLS_VIEW { + ext.prefix = { "${meta.assembler}-${meta.id}.filtered" } + ext.args = "-v snps,mnps -i 'QUAL>=${params.bcftools_view_high_variant_quality} || (QUAL>=${params.bcftools_view_medium_variant_quality} && FORMAT/AO>=${params.bcftools_view_minimal_allelesupport})'" + publishDir = [path: { "${params.outdir}/Ancient_DNA/variant_calling/filtered" }, mode: params.publish_dir_mode, pattern: "*.vcf.gz"] + } + + withName: BCFTOOLS_CONSENSUS { + ext.prefix = { "${meta.assembler}-${meta.id}" } + publishDir = [ + path: { "${params.outdir}/Ancient_DNA/variant_calling/consensus" }, + mode: params.publish_dir_mode, + pattern: "*.fa" + ] + } + + withName: BCFTOOLS_INDEX { + ext.prefix = { "${meta.assembler}-${meta.id}" } + ext.args = "-t" + publishDir = [ + path: { "${params.outdir}/Ancient_DNA/variant_calling/index" }, + mode: params.publish_dir_mode, + enabled: false + ] + } + + withName: PYDAMAGE_ANALYZE { + ext.prefix = { "${meta.assembler}-${meta.id}" } + publishDir = [ + path: { "${params.outdir}/Ancient_DNA/pydamage/analyze/${meta.assembler}-${meta.id}/" }, + mode: params.publish_dir_mode + ] + } + + withName: PYDAMAGE_FILTER { + ext.prefix = { "${meta.assembler}-${meta.id}" } + ext.args = "-t ${params.pydamage_accuracy}" + publishDir = [ + path: { "${params.outdir}/Ancient_DNA/pydamage/filter/${meta.assembler}-${meta.id}/" }, + mode: params.publish_dir_mode + ] + } + + withName: SAMTOOLS_FAIDX { + ext.prefix = { "${meta.assembler}-${meta.id}" } + publishDir = [ + path: { "${params.outdir}/Ancient_DNA/samtools/faidx" }, + mode: params.publish_dir_mode, + enabled: false + ] + } + + withName: METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS { + publishDir = [path: { "${params.outdir}/GenomeBinning/depths/contigs" }, mode: params.publish_dir_mode, pattern: '*-depth.txt.gz'] + ext.prefix = { "${meta.assembler}-${meta.id}-depth" } + } + + withName: METABAT2_METABAT2 { + publishDir = [[path: { "${params.outdir}/GenomeBinning/MetaBAT2/bins/" }, mode: params.publish_dir_mode, pattern: '*[!lowDepth|tooShort|unbinned].fa.gz'], [path: { "${params.outdir}/GenomeBinning/MetaBAT2/discarded" }, mode: params.publish_dir_mode, pattern: '*tooShort.fa.gz'], [path: { "${params.outdir}/GenomeBinning/MetaBAT2/discarded" }, mode: params.publish_dir_mode, pattern: '*lowDepth.fa.gz']] + ext.prefix = { "${meta.assembler}-MetaBAT2-${meta.id}" } + ext.args = [ + params.min_contig_size < 1500 ? 
"-m 1500" : "-m ${params.min_contig_size}", + "--unbinned", + "--seed ${params.metabat_rng_seed}" + ].join(' ').trim() + } + + withName: MAXBIN2 { + publishDir = [ + [ + path: { "${params.outdir}/GenomeBinning/MaxBin2/discarded" }, + mode: params.publish_dir_mode, + pattern: '*.tooshort.gz' + ], + [ + path: { "${params.outdir}/GenomeBinning/MaxBin2/" }, + mode: params.publish_dir_mode, + pattern: '*.{summary,abundance}' + ] + ] + ext.prefix = { "${meta.assembler}-MaxBin2-${meta.id}" } + } + + withName: ADJUST_MAXBIN2_EXT { + publishDir = [ + [ + path: { "${params.outdir}/GenomeBinning/MaxBin2/bins/" }, + mode: params.publish_dir_mode, + pattern: '*.fa.gz' + ] + ] + } + + withName: 'CONCOCT_.*' { + publishDir = [ + [ + path: { "${params.outdir}/GenomeBinning/CONCOCT/stats/" }, + mode: params.publish_dir_mode, + pattern: "*.{txt,csv,tsv}" + ], + [ + path: { "${params.outdir}/GenomeBinning/CONCOCT/bins" }, + mode: params.publish_dir_mode, + saveAs: { filename -> new File(filename).getName() }, + pattern: "*/*.fa.gz" + ] + ] + ext.prefix = { "${meta.assembler}-CONCOCT-${meta.id}" } + } + + withName: SPLIT_FASTA { + publishDir = [[path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned" }, mode: params.publish_dir_mode, pattern: '*.*[0-9].fa.gz'], [path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" }, mode: params.publish_dir_mode, pattern: '*.pooled.fa.gz'], [path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" }, mode: params.publish_dir_mode, pattern: '*.remaining.fa.gz']] + } + + withName: DASTOOL_FASTATOCONTIG2BIN_METABAT2 { + ext.prefix = { "${meta.assembler}-MetaBAT2-${meta.id}" } + } + + withName: DASTOOL_FASTATOCONTIG2BIN_MAXBIN2 { + ext.prefix = { "${meta.assembler}-MaxBin2-${meta.id}" } + } + + withName: DASTOOL_FASTATOCONTIG2BIN_CONCOCT { + ext.prefix = { "${meta.assembler}-CONCOCT-${meta.id}" } + } + + withName: DASTOOL_FASTATOCONTIG2BIN_TIARA { + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.id}" } + } + + withName: DASTOOL_DASTOOL { + publishDir = [ + [ + path: { "${params.outdir}/GenomeBinning/DASTool" }, + mode: params.publish_dir_mode, + pattern: '*.{tsv,log,eval,seqlength}' + ] + ] + ext.prefix = { "${meta.assembler}-DASTool-${meta.id}" } + ext.args = "--write_bins --write_unbinned --write_bin_evals --score_threshold ${params.refine_bins_dastool_threshold}" + } + + withName: RENAME_POSTDASTOOL { + publishDir = [ + [ + path: { "${params.outdir}/GenomeBinning/DASTool/unbinned" }, + mode: params.publish_dir_mode, + pattern: '*-DASToolUnbinned-*.fa' + ], + [ + path: { "${params.outdir}/GenomeBinning/DASTool/bins" }, + mode: params.publish_dir_mode, + pattern: '*-{MetaBAT2,MaxBin2,CONCOCT}Refined-*.fa' + ] + ] + } + + withName: TIARA_TIARA { + publishDir = [ + path: { "${params.outdir}/Taxonomy/Tiara/" }, + mode: params.publish_dir_mode, + pattern: "*.txt" + ] + ext.args = { "--min_len ${params.tiara_min_length} --probabilities" } + ext.prefix = { "${meta.assembler}-${meta.id}.tiara" } + } + + withName: TIARA_CLASSIFY { + ext.args = { "--join_prokaryotes --assembler ${meta.assembler}" } + ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.bin}-${meta.id}" } + } + + withName: TIARA_SUMMARY { + publishDir = [path: { "${params.outdir}/GenomeBinning/Tiara" }, mode: params.publish_dir_mode, pattern: "tiara_summary.tsv"] + ext.prefix = "tiara_summary" + } + + withName: MMSEQS_DATABASES { + ext.prefix = { "${params.metaeuk_mmseqs_db.replaceAll("/", "-")}" } + publishDir = [path: { 
"${params.outdir}/Annotation/mmseqs_db/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: params.save_mmseqs_db] + } + + withName: METAEUK_EASYPREDICT { + ext.args = "" + ext.prefix = { "${meta.id}" } + publishDir = [path: { "${params.outdir}/Annotation/MetaEuk/${meta.assembler}/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] + } + withName: MULTIQC { ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } publishDir = [ path: { "${params.outdir}/multiqc" }, @@ -30,5 +722,4 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - } diff --git a/conf/test.config b/conf/test.config index d7280238..04fced63 100644 --- a/conf/test.config +++ b/conf/test.config @@ -19,12 +19,19 @@ process { } params { - config_profile_name = 'Test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'// Genome references - genome = 'R64-1-1' + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.multirun.csv' + centrifuge_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_cf.tar.gz' + kraken2_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_kraken.tgz' + skip_krona = false + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + busco_clean = true + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + skip_concoct = true } diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config new file mode 100644 index 00000000..fc433a8b --- /dev/null +++ b/conf/test_adapterremoval.config @@ -0,0 +1,40 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/mag -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile for running with AdapterRemoval and domain classification' + config_profile_description = 'Minimal test dataset to check pipeline function with AdapterRemoval data and domain classification.' 
+ + // Input data + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.euk.csv' + centrifuge_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_cf.tar.gz' + kraken2_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_kraken.tgz' + metaeuk_db = params.pipelines_testdata_base_path + '/modules/data/proteomics/database/yeast_UPS.fasta' + skip_krona = true + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + binqc_tool = 'checkm' + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + clip_tool = 'adapterremoval' + skip_concoct = true + bin_domain_classification = true +} diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config new file mode 100644 index 00000000..e8dab425 --- /dev/null +++ b/conf/test_ancient_dna.config @@ -0,0 +1,47 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/mag -profile test_ancient_dna, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + + +params { + config_profile_name = 'Ancient DNA test profile ' + config_profile_description = 'Minimal test dataset to check pipeline function for ancient DNA step' + + // Input data + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.csv' + centrifuge_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_cf.tar.gz' + kraken2_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_kraken.tgz' + skip_krona = true + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + ancient_dna = true + binning_map_mode = 'own' + skip_spades = false + skip_spadeshybrid = true + bcftools_view_high_variant_quality = 0 + bcftools_view_medium_variant_quality = 0 + bcftools_view_minimal_allelesupport = 3 + refine_bins_dastool = true + refine_bins_dastool_threshold = 0 + skip_concoct = true +} diff --git a/conf/test_bbnorm.config b/conf/test_bbnorm.config new file mode 100644 index 00000000..a434f584 --- /dev/null +++ b/conf/test_bbnorm.config @@ -0,0 +1,44 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/mag -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.csv' + keep_phix = true + skip_clipping = true + skip_prokka = true + skip_prodigal = true + skip_quast = true + skip_binning = true + centrifuge_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_cf.tar.gz' + kraken2_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_kraken.tgz' + skip_krona = true + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + binqc_tool = 'checkm2' + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + bbnorm = true + coassemble_group = true +} diff --git a/conf/test_binrefinement.config b/conf/test_binrefinement.config new file mode 100644 index 00000000..9602197c --- /dev/null +++ b/conf/test_binrefinement.config @@ -0,0 +1,42 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/mag -profile test_binrefinement, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.csv' + assembly_input = params.pipelines_testdata_base_path + 'mag/samplesheets/assembly_samplesheet.csv' + centrifuge_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_cf.tar.gz' + kraken2_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_kraken.tgz' + skip_krona = true + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + refine_bins_dastool = true + refine_bins_dastool_threshold = 0 + // TODO not using 'both' until #489 merged + postbinning_input = 'refined_bins_only' + busco_clean = true +} diff --git a/conf/test_busco_auto.config b/conf/test_busco_auto.config new file mode 100644 index 00000000..902a8d89 --- /dev/null +++ b/conf/test_busco_auto.config @@ -0,0 +1,37 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/mag -profile test_busco_auto, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.csv' + skip_spades = true + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + skip_prokka = true + skip_prodigal = true + skip_quast = true + skip_concoct = true +} diff --git a/conf/test_concoct.config b/conf/test_concoct.config new file mode 100644 index 00000000..2d90ab50 --- /dev/null +++ b/conf/test_concoct.config @@ -0,0 +1,47 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Runs input data but skipping all possible steps to allow for a fast testing + profile for input checks etc. + + Use as follows: + nextflow run nf-core/mag -profile test_nothing, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test CONCOCT profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.csv' + centrifuge_db = null + kraken2_db = null + skip_krona = true + skip_clipping = true + skip_adapter_trimming = false + skip_spades = true + skip_spadeshybrid = true + skip_megahit = false + skip_quast = true + skip_prodigal = true + skip_binning = false + skip_metabat2 = false + skip_maxbin2 = true + skip_concoct = false + skip_prokka = true + skip_binqc = true + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 +} diff --git a/conf/test_full.config b/conf/test_full.config index ab9f6db8..5db195fe 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -15,10 +15,35 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + // hg19 reference with highly conserved and low-complexity regions masked by Brian Bushnell + host_fasta = "s3://ngi-igenomes/test-data/mag/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz" + input = "s3://ngi-igenomes/test-data/mag/samplesheets/samplesheet.full.csv" - // Genome references - genome = 'R64-1-1' + //centrifuge_db = "s3://ngi-igenomes/test-data/mag/p_compressed+h+v.tar.gz" + kraken2_db = "s3://ngi-igenomes/test-data/mag/minikraken_8GB_202003.tgz" + cat_db = "s3://ngi-igenomes/test-data/mag/CAT_prepare_20210107.tar.gz" + // gtdb_db = "s3://ngi-igenomes/test-data/mag/gtdbtk_r214_data.tar.gz" ## This should be updated to release 220, once we get GTDB-Tk working again + skip_gtdbtk = true + + // TODO TEMPORARY: deactivate SPAdes due to incompatibility of container with fusion file system + skip_spades = true + skip_spadeshybrid = true + + // reproducibility options for assembly + spades_fix_cpus = 10 + spadeshybrid_fix_cpus = 10 + megahit_fix_cpu_1 = true + + // available options to enable reproducibility for BUSCO (--busco_db) not used here + // to allow detection of possible problems in automated lineage selection mode using public databases + + // test CAT with official taxonomic ranks only + cat_official_taxonomy = true + + // Skip CONCOCT due to timeout issues + skip_concoct = true + + // Set Prokka compliance mode to allow metaSPAdes bins to be annotated + prokka_with_compliance = true + prokka_compliance_centre = "nfcore" } diff --git a/conf/test_host_rm.config b/conf/test_host_rm.config new file mode 100644 index 00000000..e241e03e --- /dev/null +++ b/conf/test_host_rm.config @@ -0,0 +1,35 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/mag -profile test_host_rm, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + host_fasta = params.pipelines_testdata_base_path + 'mag/host_reference/genome.hg38.chr21_10000bp_region.fa' + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.host_rm.csv' + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + skip_concoct = true +} diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config new file mode 100644 index 00000000..cfb0991c --- /dev/null +++ b/conf/test_hybrid.config @@ -0,0 +1,34 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/mag -profile test_hybrid, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.hybrid.csv' + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz" + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + skip_concoct = true +} diff --git a/conf/test_hybrid_host_rm.config b/conf/test_hybrid_host_rm.config new file mode 100644 index 00000000..9ffd3dc7 --- /dev/null +++ b/conf/test_hybrid_host_rm.config @@ -0,0 +1,35 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/mag -profile test_hybrid_host_rm, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + host_fasta = params.pipelines_testdata_base_path + 'mag/host_reference/genome.hg38.chr21_10000bp_region.fa' + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.hybrid_host_rm.csv' + min_length_unbinned_contigs = 1 + max_unbinned_contigs = 2 + skip_binqc = true + skip_concoct = true + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 +} diff --git a/conf/test_nothing.config b/conf/test_nothing.config new file mode 100644 index 00000000..0270218f --- /dev/null +++ b/conf/test_nothing.config @@ -0,0 +1,48 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Runs input data but skipping all possible steps to allow for a fast testing + profile for input checks etc. + + Use as follows: + nextflow run nf-core/mag -profile test_nothing, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test nothing profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Input data + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.csv' + centrifuge_db = null + kraken2_db = null + skip_krona = true + skip_clipping = true + skip_adapter_trimming = true + skip_spades = true + skip_spadeshybrid = true + skip_megahit = true + skip_quast = true + skip_prodigal = true + skip_binning = true + skip_metabat2 = true + skip_maxbin2 = true + skip_concoct = true + skip_prokka = true + skip_binqc = true + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + skip_concoct = true +} diff --git a/conf/test_single_end.config b/conf/test_single_end.config new file mode 100644 index 00000000..951a4361 --- /dev/null +++ b/conf/test_single_end.config @@ -0,0 +1,42 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/mag -profile test_single_end, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test single-end profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.single_end.csv' + single_end = true + centrifuge_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_cf.tar.gz' + kraken2_db = params.pipelines_testdata_base_path + 'mag/test_data/minigut_kraken.tgz' + skip_krona = true + megahit_fix_cpu_1 = true + spades_fix_cpus = 1 + binning_map_mode = 'own' + min_length_unbinned_contigs = 1000000 + max_unbinned_contigs = 2 + skip_gtdbtk = true + skip_concoct = true + skip_binqc = true + skip_gtdbtk = true + skip_prokka = true + skip_metaeuk = true +} diff --git a/conf/test_virus_identification.config b/conf/test_virus_identification.config new file mode 100644 index 00000000..380401b3 --- /dev/null +++ b/conf/test_virus_identification.config @@ -0,0 +1,47 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/mag -profile test_virus_identification, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// Limit resources so that this can run on GitHub Actions +process { + resourceLimits = [ + cpus: 4, + memory: '15.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile for running virus_identification' + config_profile_description = 'Minimal test dataset to check pipeline function virus identification' + + // Input data + input = params.pipelines_testdata_base_path + 'mag/samplesheets/samplesheet.csv' + run_virus_identification = true + genomad_splits = 7 + + // For computational efficiency + reads_minlength = 150 + coassemble_group = true + skip_gtdbtk = true + gtdbtk_min_completeness = 0.01 + skip_binning = true + skip_prokka = true + skip_spades = true + skip_spadeshybrid = true + skip_quast = true + skip_prodigal = true + skip_krona = true + skip_adapter_trimming = true + skip_metabat2 = true + skip_maxbin2 = true + skip_busco = true +} diff --git a/docs/images/mag_logo.png b/docs/images/mag_logo.png new file mode 100644 index 00000000..d19f242d Binary files /dev/null and b/docs/images/mag_logo.png differ diff --git a/docs/images/mag_logo.svg b/docs/images/mag_logo.svg new file mode 100644 index 00000000..d2203770 --- /dev/null +++ b/docs/images/mag_logo.svg @@ -0,0 +1,205 @@ + +image/svg+xmlnf- +core/ +mag + \ No newline at end of file diff --git a/docs/images/mag_logo_mascot_dark.png b/docs/images/mag_logo_mascot_dark.png new file mode 100644 index 00000000..4ec2f131 Binary files /dev/null and b/docs/images/mag_logo_mascot_dark.png differ diff --git a/docs/images/mag_logo_mascot_dark.svg b/docs/images/mag_logo_mascot_dark.svg new file mode 100644 index 00000000..37eb8387 --- /dev/null +++ b/docs/images/mag_logo_mascot_dark.svg @@ -0,0 +1,331 @@ + +image/svg+xml + + +nf-core/mag diff --git 
a/docs/images/mag_logo_mascot_light.png b/docs/images/mag_logo_mascot_light.png new file mode 100644 index 00000000..ecb17340 Binary files /dev/null and b/docs/images/mag_logo_mascot_light.png differ diff --git a/docs/images/mag_logo_mascot_light.svg b/docs/images/mag_logo_mascot_light.svg new file mode 100644 index 00000000..0e42229d --- /dev/null +++ b/docs/images/mag_logo_mascot_light.svg @@ -0,0 +1,331 @@ +[SVG text omitted: nf-core/mag mascot logo, light variant] diff --git a/docs/images/mag_metromap_simple_dark.png b/docs/images/mag_metromap_simple_dark.png new file mode 100644 index 00000000..ee73c55a Binary files /dev/null and b/docs/images/mag_metromap_simple_dark.png differ diff --git a/docs/images/mag_metromap_simple_dark.svg b/docs/images/mag_metromap_simple_dark.svg new file mode 100644 index 00000000..5eccb751 --- /dev/null +++ b/docs/images/mag_metromap_simple_dark.svg @@ -0,0 +1,2047 @@ +[SVG text omitted: nf-core/mag v3.3 simplified metro map, dark variant, labelling the ten stages from short- and long-read preprocessing through short read classification, assembly, ancient DNA validation, binning, bin refinement, annotation, contig classification and report generation] diff --git a/docs/images/mag_metromap_simple_light.png b/docs/images/mag_metromap_simple_light.png new file mode 100644 index 00000000..1ee20624 Binary files /dev/null and b/docs/images/mag_metromap_simple_light.png differ diff --git a/docs/images/mag_metromap_simple_light.svg b/docs/images/mag_metromap_simple_light.svg new file mode 100644 index 00000000..3b4011ce --- /dev/null +++ b/docs/images/mag_metromap_simple_light.svg @@ -0,0 +1,2050 @@ +[SVG text omitted: the same simplified metro map, light variant] diff --git a/docs/images/mag_workflow.png b/docs/images/mag_workflow.png new file mode 100644 index 00000000..4969e68c Binary files /dev/null and b/docs/images/mag_workflow.png differ diff --git a/docs/images/mag_workflow.svg b/docs/images/mag_workflow.svg new file mode 100644 index 00000000..38171379 --- /dev/null +++ b/docs/images/mag_workflow.svg @@ -0,0 +1,2711 @@ +[SVG text omitted: nf-core/mag v3.3 detailed workflow diagram, CC-BY 4.0, design originally by Zandra Fagernäs]
filteringAdapter/qualitytrimmingPorechopporechop_ABIReportingMultiQC(MAG summary)tsvDBporechop_ABIDBTaxonomic classificationCATGTDB-TkTiaraMetaEukGenome annotationPROKKAProtein-codinggene predictionPRODIGALVirus identificationAssembly(sample- or group-wise)EvaluationQUASTaDNA ValidationpyDamageFreebayesBCFToolsgeNomadSPAdesMEGAHITSPAdesHybridBinningMetaBAT2MaxBin2CONCOCTEvaluationBUSCOCheckMCheckM2GUNCQUAST(Abundance estimation and visualization)v3.3Binning refinementDAS Tool + + +nf-core/magCC-BY 4.0 Design originally by Zandra FagernäsBin post-processing diff --git a/docs/images/mag_workflow_dark.png b/docs/images/mag_workflow_dark.png new file mode 100644 index 00000000..c9226ea6 Binary files /dev/null and b/docs/images/mag_workflow_dark.png differ diff --git a/docs/images/mag_workflow_dark.svg b/docs/images/mag_workflow_dark.svg new file mode 100644 index 00000000..105893cb --- /dev/null +++ b/docs/images/mag_workflow_dark.svg @@ -0,0 +1,2653 @@ + + + + +image/svg+xmlTaxonomicclassificationCentrifugeKraken2VisualizationKronaDomain classificationReportingMultiQC(MAG summary)tsvDBShort reads(required)Adapter/qualitytrimmingBBNormfastpAdapterRemovalHost read removalDepth normalisationBowtie2Remove PhiXBowtie2FastQCEvaluation csvLong reads(optional)NanoPlotEvaluationNanoLyseRemove LambdaNanoqchopperchopperFiltlongQuality filteringAdapter/qualitytrimmingPorechopporechop_ABIDBTaxonomic classificationCATGTDB-TkTiaraMetaEukGenome annotationPROKKAProtein-codinggene predictionPRODIGALVirus identificationAssembly(sample- or group-wise)EvaluationQUASTaDNA ValidationpyDamageFreebayesBCFToolsgeNomadSPAdesMEGAHITSPAdesHybridDBBinningMetaBAT2MaxBin2CONCOCTEvaluationBUSCOCheckMCheckM2GUNCQUAST(Abundance estimation and visualization)v3.3Binning refinementDAS Toolnf-core/magCC-BY 4.0 Design originally by Zandra FagernäsBin post-processing diff --git a/docs/output.md b/docs/output.md index ff36398b..daece6f6 100644 --- a/docs/output.md +++ b/docs/output.md @@ -6,27 +6,761 @@ This document describes the output produced by the pipeline. Most of the plots a The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
-
-
 ## Pipeline overview
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
-- [FastQC](#fastqc) - Raw read QC
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
+- [Quality control](#quality-control) of input reads - trimming and contaminant removal
+- [Taxonomic classification of trimmed reads](#taxonomic-classification-of-trimmed-reads)
+- [Digital sequencing normalisation](#digital-normalization-with-bbnorm)
+- [Assembly](#assembly) of trimmed reads
+- [Protein-coding gene prediction](#gene-prediction) of assemblies
+- [Virus identification](#virus-identification-in-assemblies) of assemblies
+- [Binning and binning refinement](#binning-and-binning-refinement) of assembled contigs
+- [Taxonomic classification of binned genomes](#taxonomic-classification-of-binned-genomes)
+- [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes)
+- [Additional summary for binned genomes](#additional-summary-for-binned-genomes)
+- [Ancient DNA](#ancient-dna)
+- [MultiQC](#multiqc) - aggregate report describing results of the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
+Note that when the parameter `--coassemble_group` is specified, the group ID (more precisely, the term `group-[group_id]`) is used instead of the sample ID in the output filenames/directories of the assembly and downstream processes.
+
+## Quality control
+
+These steps trim away adapter sequences present in the input reads, remove low-quality bases and discard reads that are too short.
+They also remove host contaminants and sequencing controls, such as PhiX or the Lambda phage.
+FastQC is run to visualise the general quality metrics of the sequencing runs before and after trimming.
+
+
 ### FastQC
    Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `QC_shortreads/fastqc/` + - `[sample]_[1/2]_fastqc.html`: FastQC report, containing quality metrics for your untrimmed raw fastq files + - `[sample].trimmed_[1/2]_fastqc.html`: FastQC report, containing quality metrics for trimmed and, if specified, filtered read files + +
+
+[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
+
+### fastp
+
+[fastp](https://github.com/OpenGene/fastp) is an all-in-one FASTQ preprocessor for read/adapter trimming and quality control. It is used in this pipeline to trim adapter sequences and discard low-quality reads. Its output is in the results folder and part of the MultiQC report.
+
    +Output files + +- `QC_shortreads/fastp/[sample]/` + - `fastp.html`: Interactive report + - `fastp.json`: Report in json format + +
    + +### AdapterRemoval2 + +[AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/) searches for and removes remnant adapter sequences from High-Throughput Sequencing (HTS) data and (optionally) trims low quality bases from the 3' end of reads following adapter removal. It is popular in the field of palaeogenomics. The output logs are stored in the results folder, and as a part of the MultiQC report. + +
    +Output files + +- `QC_shortreads/adapterremoval/[sample]/` + - `[sample]_ar2.settings`: AdapterRemoval log file. + +
    + +### Remove PhiX sequences from short reads + +The pipeline uses bowtie2 to map the reads against PhiX and removes mapped reads. + +
    +Output files + +- `QC_shortreads/remove_phix/` + - `[sample].phix_removed.bowtie2.log`: Contains a brief log file indicating how many reads have been retained. + +
    + +### Host read removal + +The pipeline uses bowtie2 to map short reads against the host reference genome specified with `--host_genome` or `--host_fasta` and removes mapped reads. The information about discarded and retained reads is also included in the MultiQC report. + +
    +Output files + +- `QC_shortreads/remove_host/` + - `[sample].host_removed.bowtie2.log`: Contains the bowtie2 log file indicating how many reads have been mapped. + - `[sample].host_removed.mapped*.read_ids.txt`: Contains a file listing the read ids of discarded reads. + +
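+
+For reference, host read removal as described above is enabled with either `--host_genome` or, as in the minimal sketch below, a custom reference via `--host_fasta` (the samplesheet and FASTA paths are placeholders):
+
+```bash
+# Remove short reads mapping to a user-supplied host reference (placeholder paths)
+nextflow run nf-core/mag \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --host_fasta host_reference.fasta
+```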
    + +### Remove Phage Lambda sequences from long reads + +The pipeline uses Nanolyse to map the reads against the Lambda phage and removes mapped reads. + +
    +Output files + +- `QC_longreads/NanoLyse/` + - `[sample]_[run]_lambdafiltered.nanolyse.log`: Contains a brief log file indicating how many reads have been removed. + +
+
+### Long read adapter removal
+
+The pipeline uses porechop_abi or porechop to perform adapter trimming of any long reads that are optionally provided in the CSV input samplesheet.
+
    +Output files + +- `QC_longreads/porechop/` + - `[sample]_[run]_porechop_trimmed.fastq.gz`: If `--longread_adaptertrimming_tool 'porechop'`, the adapter trimmed FASTQ files from porechop + - `[sample]_[run]_porechop-abi_trimmed.fastq.gz`: If `--longread_adaptertrimming_tool 'porechop_abi'`, the adapter trimmed FASTQ files from porechop_ABI + +
+
+### Long read filtering
+
+The pipeline uses Filtlong, chopper, or nanoq for quality filtering of long reads, selected with `--longread_filtering_tool`. Only Filtlong is capable of filtering long reads against the short reads, and it is therefore currently recommended in hybrid mode. If chopper is selected as the long read filtering tool, phage Lambda removal will also be performed with chopper instead of NanoLyse.
+
    +Output files + +- `QC_longreads/Filtlong/` + - `[sample]_[run]_filtlong.fastq.gz`: The length and quality filtered reads in FASTQ from Filtlong +- `QC_longreads/Nanoq/` + - `[sample]_[run]_nanoq_filtered.fastq.gz`: The length and quality filtered reads in FASTQ from Nanoq +- `QC_longreads/Chopper/` + - `[sample]_[run]_nanoq_chopper.fastq.gz`: The length and quality filtered, optionally phage lambda removed reads in FASTQ from Chopper + +
+
+Trimmed and filtered FASTQ output directories and files will only exist if `--save_porechop_reads` and/or `--save_filtered_longreads` (respectively) are provided to the run command.
+
+No direct host read removal is performed for long reads.
+However, since within this pipeline Filtlong scores read quality based on k-mer matches to the already filtered short reads, reads not overlapping those short reads might be discarded. Note that this only applies when using Filtlong as the long read filtering tool.
+The lower the parameter `--longreads_length_weight`, the greater the impact of read quality on filtering.
+For further documentation see the [Filtlong online documentation](https://github.com/rrwick/Filtlong). An example command combining these long read options is shown below, after the NanoPlot output listing.
+
+### Quality visualisation for long reads
+
+NanoPlot is used to calculate various metrics and plots about the quality and length distribution of long reads. For more information about NanoPlot see the [online documentation](https://github.com/wdecoster/NanoPlot).
+
    +Output files + +- `QC_longreads/NanoPlot/[sample]/` + - `raw_*.[png/html/txt]`: Plots and reports for raw data + - `filtered_*.[png/html/txt]`: Plots and reports for filtered data + +
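+
+Putting the long read options above together, a run selecting Filtlong for filtering might look like the following minimal sketch (values are illustrative; paths are placeholders):
+
+```bash
+# Long read preprocessing: porechop_ABI adapter trimming, Filtlong filtering,
+# keeping the filtered FASTQ files in the results directory
+nextflow run nf-core/mag \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --longread_adaptertrimming_tool porechop_abi \
+  --longread_filtering_tool filtlong \
+  --longreads_length_weight 10 \
+  --save_filtered_longreads
+```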
+
+## Digital normalization with BBnorm
+
+If the pipeline is called with the `--bbnorm` option, it will normalize the sequencing depth of libraries prior to assembly by removing reads, in order to 1) reduce the coverage of very abundant k-mers and 2) remove very rare k-mers (see the `--bbnorm_target` and `--bbnorm_min` parameters, and the example command below).
+When called in conjunction with `--coassemble_group`, BBnorm will operate on interleaved (merged) FastQ files, producing only a single output file.
+If the `--save_bbnorm_reads` parameter is set, the resulting FastQ files are saved together with log output.
+
+Output files
+
+- `bbmap/bbnorm/[sample]*.fastq.gz`
+- `bbmap/bbnorm/log/[sample].bbnorm.log`
+
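+
+As a sketch, depth normalisation with BBnorm could be enabled as follows (the target and minimum depths are illustrative values):
+
+```bash
+# Normalise library depth before assembly and keep the normalised reads
+nextflow run nf-core/mag \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --bbnorm \
+  --bbnorm_target 100 \
+  --bbnorm_min 5 \
+  --save_bbnorm_reads
+```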
+
+## Taxonomic classification of trimmed reads
+
+### Kraken
+
+Kraken2 classifies reads using a k-mer based approach and assigns taxonomy using a Lowest Common Ancestor (LCA) algorithm.
+
    +Output files + +- `Taxonomy/kraken2/[sample]/` + - `kraken2.report`: Classification in the Kraken report format. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details + - `taxonomy.krona.html`: Interactive pie chart produced by [KronaTools](https://github.com/marbl/Krona/wiki) + +
+
+### Centrifuge
+
+Centrifuge is commonly used for the classification of DNA sequences from microbial samples. It uses an indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index.
+
+More information is available on the [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) website.
+
    +Output files + +- `Taxonomy/centrifuge/[sample]/` + - `[sample].kreport.txt`: Classification in the Kraken report format. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details + - `[sample].report.txt`: Tab-delimited result file. See the [centrifuge manual](https://ccb.jhu.edu/software/centrifuge/manual.shtml#centrifuge-classification-output) for information about the fields + - `[sample].results.txt`: Per read taxonomic classification information. See the [centrifuge manual](https://ccb.jhu.edu/software/centrifuge/manual.shtml#centrifuge-classification-output) for more details + - `[sample].html`: Interactive pie chart produced by [KronaTools](https://github.com/marbl/Krona/wiki) + +
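+
+For reference, a minimal sketch enabling both read classifiers described above (the database paths are placeholders):
+
+```bash
+# Classify trimmed reads with both Kraken2 and Centrifuge
+nextflow run nf-core/mag \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --kraken2_db /path/to/kraken2_db.tgz \
+  --centrifuge_db /path/to/centrifuge_db.tar.gz
+```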
+
+## Assembly
+
+Trimmed (short) reads are assembled with both MEGAHIT and SPAdes. Hybrid assembly is only supported by SPAdes.
+
+### MEGAHIT
+
+[MEGAHIT](https://github.com/voutcn/megahit) is a single-node assembler for large and complex metagenomic short-read datasets.
+
    +Output files + +- `Assembly/MEGAHIT/` + - `[sample/group].contigs.fa.gz`: Compressed metagenome assembly in fasta format + - `[sample/group].log`: Log file + - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs + - `MEGAHIT-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. + - `MEGAHIT-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). + - `MEGAHIT-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. + +
    + +### SPAdes + +[SPAdes](http://cab.spbu.ru/software/spades/) was originally a single genome assembler that later added support for assembling metagenomes. + +
    +Output files + +- `Assembly/SPAdes/` + - `[sample/group].scaffolds.fa.gz`: Compressed assembled scaffolds in fasta format + - `[sample/group].assembly.gfa.gz`: Compressed assembly graph in gfa format + - `[sample/group].contigs.fa.gz`: Compressed assembled contigs in fasta format + - `[sample/group].spades.log`: Log file + - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs + - `SPAdes-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. + - `SPAdes-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). + - `SPAdes-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. + +
    + +### SPAdesHybrid + +SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) software and is used when the user provides both long and short reads. + +
    +Output files + +- `Assembly/SPAdesHybrid/` + - `[sample/group].scaffolds.fa.gz`: Compressed assembled scaffolds in fasta format + - `[sample/group].assembly.gfa.gz`: Compressed assembly graph in gfa format + - `[sample/group].contigs.fa.gz`: Compressed assembled contigs in fasta format + - `[sample/group].spades.log`: Log file + - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs + - `SPAdesHybrid-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. + - `SPAdesHybrid-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). + - `SPAdesHybrid-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. + +
    + +### Metagenome QC with QUAST + +[QUAST](http://cab.spbu.ru/software/quast/) is a tool that evaluates metagenome assemblies by computing various metrics. The QUAST output is also included in the MultiQC report, as well as in the assembly directories themselves. + +
    +Output files + +- `Assembly/[assembler]/QC/[sample/group]/QUAST/` + - `report.*`: QUAST report in various formats, such as html, pdf, tex, tsv, or txt + - `transposed_report.*`: QUAST report that has been transposed into wide format (tex, tsv, or txt) + - `quast.log`: QUAST log file + - `metaquast.log`: MetaQUAST log file + - `icarus.html`: Icarus main menu with links to interactive viewers + - `icarus_viewers/contig_size_viewer.html`: Diagram of contigs that are ordered from longest to shortest + - `basic_stats/cumulative_plot.pdf`: Shows the growth of contig lengths (contigs are ordered from largest to shortest) + - `basic_stats/GC_content_plot.pdf`: Shows the distribution of GC content in the contigs + - `basic_stats/[assembler]-[sample/group]_GC_content_plot.pdf`: Histogram of the GC percentage for the contigs + - `basic_stats/Nx_plot.pdf`: Plot of Nx values as x varies from 0 to 100%. + - `predicted_genes/[assembler]-[sample/group].rna.gff`: Contig positions for rRNA genes in gff version 3 format + - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor) + +
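+
+As an illustration of the assembly options above, group-wise co-assembly with MEGAHIT only (skipping SPAdes) could be requested as follows (sketch only):
+
+```bash
+# Co-assemble samples per group and only run MEGAHIT
+nextflow run nf-core/mag \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --coassemble_group \
+  --skip_spades \
+  --skip_spadeshybrid
+```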
    + +## Gene prediction + +Protein-coding genes are predicted for each assembly. + +
    +Output files + +- `Annotation/Prodigal/` + - `[assembler]-[sample/group].gff.gz`: Gene Coordinates in GFF format + - `[assembler]-[sample/group].faa.gz`: The protein translation file consists of all the proteins from all the sequences in multiple FASTA format. + - `[assembler]-[sample/group].fna.gz`: Nucleotide sequences of the predicted proteins using the DNA alphabet, not mRNA (so you will see 'T' in the output and not 'U'). + - `[assembler]-[sample/group]_all.txt.gz`: Information about start positions of genes. + +
    + +## Virus identification in assemblies + +### geNomad + +[geNomad](https://github.com/apcamargo/genomad) identifies viruses and plasmids in sequencing data (isolates, metagenomes, and metatranscriptomes) + +
    +Output files + +- `VirusIdentification/geNomad/[assembler]-[sample/group]*/` + - `[assembler]-[sample/group]*_annotate` + - `[assembler]-[sample/group]*_taxonomy.tsv`: Taxonomic assignment data + - `[assembler]-[sample/group]*_aggregated_classification` + - `[assembler]-[sample/group]*_aggregated_classification.tsv`: Sequence classification in tabular format + - `[assembler]-[sample/group]*_find_proviruses` + - `[assembler]-[sample/group]*_provirus.tsv`: Characteristics of proviruses identified by geNomad + - `[assembler]-[sample/group]*_summary` + - `[assembler]-[sample/group]*_virus_summary.tsv`: Virus classification summary file in tabular format + - `[assembler]-[sample/group]*_plasmid_summary.tsv`: Plasmid classification summary file in tabular format + - `[assembler]-[sample/group]*_viruses_genes.tsv`: Virus gene annotation data in tabular format + - `[assembler]-[sample/group]*_plasmids_genes.tsv`: Plasmid gene annotation data in tabular format + - `[assembler]-[sample/group]*_viruses.fna`: Virus nucleotide sequences in FASTA format + - `[assembler]-[sample/group]*_plasmids.fna`: Plasmid nucleotide sequences in FASTA format + - `[assembler]-[sample/group]*_viruses_proteins.faa`: Virus protein sequences in FASTA format + - `[assembler]-[sample/group]*_plasmids_proteins.faa`: Plasmid protein sequences in FASTA format + - `[assembler]-[sample/group]*.log`: Plain text log file detailing the steps executed by geNomad (annotate, find-proviruses, marker-classification, nn-classification, aggregated-classification and summary) + +
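+
+Virus identification is an opt-in step; a minimal sketch enabling it (the number of geNomad splits is an illustrative value):
+
+```bash
+# Screen assemblies for viruses and plasmids with geNomad
+nextflow run nf-core/mag \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --run_virus_identification \
+  --genomad_splits 7
+```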
    + +## Binning and binning refinement + +### Contig sequencing depth + +Sequencing depth per contig and sample is generated by MetaBAT2's `jgi_summarize_bam_contig_depths --outputDepth`. The values correspond to `(sum of exactly aligned bases) / ((contig length)-2*75)`. For example, for two reads aligned exactly with `10` and `9` bases on a 1000 bp long contig the depth is calculated by `(10+9)/(1000-2*75)` (1000bp length of contig minus 75bp from each end, which is excluded). + +These depth files are used for downstream binning steps. + +
    +Output files + +- `GenomeBinning/depths/contigs/` + - `[assembler]-[sample/group]-depth.txt.gz`: Sequencing depth for each contig and sample or group, only for short reads. + +
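+
+The worked example above can be reproduced on the command line as a quick sanity check:
+
+```bash
+# Depth for two reads aligning with 10 and 9 bases on a 1000 bp contig
+awk 'BEGIN { printf "%.4f\n", (10 + 9) / (1000 - 2 * 75) }'
+# prints 0.0224
+```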
+
+### MetaBAT2
+
+[MetaBAT2](https://bitbucket.org/berkeleylab/metabat) recovers genome bins (that is, contigs/scaffolds that all belong to the same organism) from metagenome assemblies.
+
    +Output files + +- `GenomeBinning/MetaBAT2/` + - `bins/[assembler]-[binner]-[sample/group].*.fa.gz`: Genome bins retrieved from input assembly + - `unbinned/[assembler]-[binner]-[sample/group].unbinned.[1-9]*.fa.gz`: Contigs that were not binned with other contigs but considered interesting. By default, these are at least 1 Mbp (`--min_length_unbinned_contigs`) in length and at most the 100 longest contigs (`--max_unbinned_contigs`) are reported + +
    + +All the files and contigs in these folders will be assessed by QUAST and BUSCO. + +All other files that were discarded by the tool, or from the low-quality unbinned contigs, can be found here. + +
    +Output files + +- `GenomeBinning/MetaBAT2/discarded/` + - `*.lowDepth.fa.gz`: Low depth contigs that are filtered by MetaBAT2 + - `*.tooShort.fa.gz`: Too short contigs that are filtered by MetaBAT2 +- `GenomeBinning/MetaBAT2/unbinned/discarded/` + - `*.unbinned.pooled.fa.gz`: Pooled unbinned contigs equal or above `--min_contig_size`, by default 1500 bp. + - `*.unbinned.remaining.fa.gz`: Remaining unbinned contigs below `--min_contig_size`, by default 1500 bp, but not in any other file. + +
+
+All the files in this folder contain small and/or unbinned contigs that are not further processed.
+
+Together, the files in these two folders contain all contigs of an assembly.
+
+### MaxBin2
+
+[MaxBin2](https://sourceforge.net/projects/maxbin2/) recovers genome bins (that is, contigs/scaffolds that all belong to the same organism) from metagenome assemblies.
+
    +Output files + +- `GenomeBinning/MaxBin2/` + - `bins/[assembler]-[binner]-[sample/group].*.fa.gz`: Genome bins retrieved from input assembly + - `unbinned/[assembler]-[binner]-[sample/group].noclass.[1-9]*.fa.gz`: Contigs that were not binned with other contigs but considered interesting. By default, these are at least 1 Mbp (`--min_length_unbinned_contigs`) in length and at most the 100 longest contigs (`--max_unbinned_contigs`) are reported. + +
    + +All the files and contigs in these folders will be assessed by QUAST and BUSCO. + +
    +Output files + +- `GenomeBinning/MaxBin2/discarded/` + - `*.tooshort.gz`: Too short contigs that are filtered by MaxBin2 +- `GenomeBinning/MaxBin2/unbinned/discarded/` + - `*.noclass.pooled.fa.gz`: Pooled unbinned contigs equal or above `--min_contig_size`, by default 1500 bp. + - `*.noclass.remaining.fa.gz`: Remaining unbinned contigs below `--min_contig_size`, by default 1500 bp, but not in any other file. + +
    + +All the files in this folder contain small and/or unbinned contigs that are not further processed. + +Files in these two folders contain all contigs of an assembly. + +### CONCOCT + +[CONCOCT](https://github.com/BinPro/CONCOCT) performs unsupervised binning of metagenomic contigs by using nucleotide composition, coverage data in multiple samples and linkage data from paired end reads. + +
+Output files
+
+- `GenomeBinning/CONCOCT/`
+  - `bins/[assembler]-[binner]-[sample/group].*.fa.gz`: Genome bins retrieved from input assembly
+  - `stats/[assembler]-[binner]-[sample/group].csv`: Table indicating which contig goes with which cluster bin.
+  - `stats/[assembler]-[binner]-[sample/group]*_gt1000.csv`: Various intermediate PCA statistics used for clustering.
+  - `stats/[assembler]-[binner]-[sample/group]_*.tsv`: Coverage statistics of each sub-contig cut up by CONCOCT in an intermediate step prior to binning. Likely not useful in most cases.
+  - `stats/[assembler]-[binner]-[sample/group].log.txt`: CONCOCT execution log file.
+  - `stats/[assembler]-[binner]-[sample/group]_*.args`: List of arguments used in CONCOCT execution.
+
    + +All the files and contigs in these folders will be assessed by QUAST and BUSCO, if the parameter `--postbinning_input` is not set to `refined_bins_only`. + +Note that CONCOCT does not output what it considers 'unbinned' contigs, therefore no 'discarded' contigs are produced here. You may still need to do your own manual curation of the resulting bins. + +### DAS Tool + +[DAS Tool](https://github.com/cmks/DAS_Tool) is an automated binning refinement method that integrates the results of a flexible number of binning algorithms to calculate an optimized, non-redundant set of bins from a single assembly. nf-core/mag uses this tool to attempt to further improve bins based on combining the MetaBAT2 and MaxBin2 binning output, assuming sufficient quality is met for those bins. + +DAS Tool will remove contigs from bins that do not pass additional filtering criteria, and will discard redundant lower-quality output from binners that represent the same estimated 'organism', until the single highest quality bin is represented. + +> ⚠️ If DAS Tool does not find any bins passing your selected threshold it will exit with an error. Such an error is 'ignored' by nf-core/mag, therefore you will not find files in the `GenomeBinning/DASTool/` results directory for that particular sample. + +
    +Output files + +- `GenomeBinning/DASTool/` + - `[assembler]-[sample/group]_allBins.eval`: Tab-delimited description with quality and completeness metrics for the input bin sets. Quality and completeness are estimated by DAS TOOL using a scoring function based on the frequency of bacterial or archaeal reference single-copy genes (SCG). Please see note at the bottom of this section on file names. + - `[assembler]-[sample/group]_DASTool_summary.tsv`: Tab-delimited description with quality and completeness metrics for the refined output bin sets. + - `[assembler]-[sample/group]_DASTool_contig2bin.tsv`: File describing which contig is associated to which bin from the input binners. + - `[assembler]-[sample/group]_DASTool.log`: Log file from the DAS Tool run describing the command executed and additional runtime information. + - `[assembler]-[sample/group].seqlength`: Tab-delimited file describing the length of each contig. + - `bins/[assembler]-[binner]Refined-[sample/group].*.fa`: Refined bins in fasta format. + - `unbinned/[assembler]-DASToolUnbinned-[sample/group].*.fa`: Unbinned contigs from bin refinement in fasta format. + +
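+
+Bin refinement with DAS Tool is enabled with `--refine_bins_dastool`; which bins are then used downstream is controlled by `--postbinning_input`, as described below. A minimal sketch:
+
+```bash
+# Refine MetaBAT2/MaxBin2 bins with DAS Tool and use only the refined bins downstream
+nextflow run nf-core/mag \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --refine_bins_dastool \
+  --postbinning_input refined_bins_only
+```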
+
+By default, only the raw bins (and unbinned contigs) from the actual binning methods, but not from the binning refinement with DAS Tool, will be used for downstream bin quality control, annotation and taxonomic classification. The parameter `--postbinning_input` can be used to change this behaviour.
+
+⚠️ Due to the ability to perform downstream QC of both raw and refined bins in parallel (via `--postbinning_input`), bin names in DAS Tool's `*_allBins.eval` file will include `Refined`. However, for this particular file, they _actually_ refer to the 'raw' input bins. The pipeline renames the input files prior to running DAS Tool to ensure they can be disambiguated from the original bin files in the downstream QC steps.
+
+### Tiara
+
+Tiara is a contig classifier that identifies the domain (prokarya, eukarya) of contigs within an assembly. This pipeline uses it to identify, quickly and with few resources, the most likely domain classification of each bin (or set of unbinned contigs) based on the classifications of its contigs.
+
    +Output files + +- `Taxonomy/Tiara/` + - `[assembler]-[sample/group].tiara.txt` - Tiara output classifications (with probabilities) for all contigs within the specified sample/group assembly + - `log_[assembler]-[sample/group].txt` - log file detailing the parameters used by the Tiara model for contig classification. +- `GenomeBinning/tiara_summary.tsv` - Summary of Tiara domain classification for all bins. + +
+
+Typically, you would use `tiara_summary.tsv` as the primary file to see at a glance which bins (or unbinned contig sets) have been classified to which domains, whereas the files in `Taxonomy/Tiara` provide classifications for each contig.
+
+### Bin sequencing depth
+
+For each bin or refined bin, the median sequencing depth is computed based on the corresponding contig depths.
+
    +Output files + +- `GenomeBinning/depths/bins/` + - `bin_depths_summary.tsv`: Summary of bin sequencing depths for all samples. Depths are available for samples mapped against the corresponding assembly, i.e. according to the mapping strategy specified with `--binning_map_mode`. Only for short reads. + - `bin_refined_depths_summary.tsv`: Summary of sequencing depths for refined bins for all samples, if refinement was performed. Depths are available for samples mapped against the corresponding assembly, i.e. according to the mapping strategy specified with `--binning_map_mode`. Only for short reads. + - `[assembler]-[binner]-[sample/group]-binDepths.heatmap.png`: Clustered heatmap showing bin abundances of the assembly across samples. Bin depths are transformed to centered log-ratios and bins as well as samples are clustered by Euclidean distance. Again, sample depths are available according to the mapping strategy specified with `--binning_map_mode`. If a sample produces only a single bin, a heatmap will not be provided. + +
    + +### QC for metagenome assembled genomes with QUAST + +[QUAST](http://cab.spbu.ru/software/quast/) is a tool that evaluates genome assemblies by computing various metrics. The QUAST output is in the bin directories shown below. This QUAST output is not shown in the MultiQC report. + +
    +Output files + +- `GenomeBinning/QC/QUAST/[assembler]-[bin]/` + - `report.*`: QUAST report in various formats, such as html, pdf, tex, tsv, or txt + - `transposed_report.*`: QUAST report that has been transposed into wide format (tex, tsv, or txt) + - `quast.log`: QUAST log file + - `metaquast.log`: MetaQUAST log file + - `icarus.html`: Icarus main menu with links to interactive viewers + - `icarus_viewers/contig_size_viewer.html`: Diagram of contigs that are ordered from longest to shortest + - `basic_stats/cumulative_plot.pdf`: Shows the growth of contig lengths (contigs are ordered from largest to shortest) + - `basic_stats/GC_content_plot.pdf`: Shows the distribution of GC content in the contigs + - `basic_stats/[assembler]-[bin]_GC_content_plot.pdf`: Histogram of the GC percentage for the contigs + - `basic_stats/Nx_plot.pdf`: Plot of Nx values as x varies from 0 to 100%. + - `predicted_genes/[assembler]-[bin].rna.gff`: Contig positions for rRNA genes in gff version 3 format + - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor) +- `GenomeBinning/QC/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]-quast_summary.tsv`: QUAST output summarized per sample/condition. + - `quast_summary.tsv`: QUAST output for all bins summarized + +
    + +### QC for metagenome assembled genomes + +#### BUSCO + +[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by the applied binning and/or binning refinement methods (depending on the `--postbinning_input` parameter). By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_db`, only results for this specific lineage will be generated. + +
    +Output files + +- `GenomeBinning/QC/BUSCO/` + - `[assembler]-[bin]_busco.log`: Log file containing the standard output of BUSCO. + - `[assembler]-[bin]_busco.err`: File containing potential error messages returned from BUSCO. + - `short_summary.domain.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results for the selected domain when run in automated lineage selection mode. Not available for bins for which a viral lineage was selected. + - `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_db`. + - `[assembler]-[bin]_buscos.[lineage].fna.gz`: Nucleotide sequence of all identified BUSCOs for used lineages (domain or specific). + - `[assembler]-[bin]_buscos.[lineage].faa.gz`: Aminoacid sequence of all identified BUSCOs for used lineages (domain or specific). + - `[assembler]-[bin]_prodigal.gff`: Genes predicted with Prodigal. + +
    + +If the parameter `--save_busco_db` is set, additionally the used BUSCO lineage datasets are stored in the output directory. + +
    +Output files + +- `GenomeBinning/QC/BUSCO/` + - `busco_downloads/`: All files and lineage datasets downloaded by BUSCO when run in automated lineage selection mode. (Can currently not be used to reproduce analysis, see the [nf-core/mag website documentation](https://nf-co.re/mag/usage#reproducibility) how to achieve reproducible BUSCO results). + - `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_db`.
    -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).### MultiQC +Besides the reference files or output files created by BUSCO, the following summary files will be generated: + +
    +Output files + +- `GenomeBinning/QC/` + - `busco_summary.tsv`: A summary table of the BUSCO results, with % of marker genes found. If run in automated lineage selection mode, both the results for the selected domain and for the selected more specific lineage will be given, if available. + +
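+
+For reproducible BUSCO results, a specific lineage dataset can be supplied and kept with the results; a sketch, with the lineage path as a placeholder:
+
+```bash
+# Run BUSCO against a fixed, pre-downloaded lineage dataset and save it
+nextflow run nf-core/mag \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --busco_db /path/to/bacteria_odb10 \
+  --save_busco_db
+```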
+
+#### CheckM
+
+[CheckM](https://ecogenomics.github.io/CheckM/) provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. It provides robust estimates of genome completeness and contamination by using collocated sets of genes that are ubiquitous and single-copy within a phylogenetic lineage.
+
+By default, nf-core/mag runs CheckM with the `check_lineage` workflow, which places genome bins on a reference tree to define lineage-specific marker sets and checks for completeness and contamination based on those marker genes, and then runs `qa` to generate the summary files.
+
+Output files
+
+- `GenomeBinning/QC/CheckM/`
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: Intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
+- `GenomeBinning/QC/`
+  - `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`).
+
+
+If the parameter `--save_checkm_reference` is set, the CheckM reference datasets used are additionally stored in the output directory.
+
    +Output files + +- `GenomeBinning/QC/CheckM/` + - `checkm_downloads/`: All CheckM reference files downloaded from the CheckM FTP server, when not supplied by the user. + - `checkm_data_2015_01_16/*`: a range of directories and files required for CheckM to run. + +
    + +#### CheckM2 + +[CheckM2](https://github.com/chklovski/CheckM2) is a tool for assessing the quality of metagenome-derived genomes. It uses a machine learning approach to predict the completeness and contamination of a genome regardless of its taxonomic lineage. + +
+Output files
+
+- `GenomeBinning/QC/CheckM2/`
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/quality_report.tsv`: Detailed statistics about bins informing completeness and contamination scores. This should normally be your main file to use to evaluate your results.
+  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: Intermediate files for CheckM2 results, including CheckM2 generated annotations, log, and DIAMOND alignment results.
+- `GenomeBinning/QC/`
+  - `checkm2_summary.tsv`: A summary table of the CheckM2 results for all bins.
+
    + +If the parameter `--save_checkm2_data` is set, the CheckM2 reference datasets will be stored in the output directory. + +
    +Output files + +- `GenomeBinning/QC/CheckM2/` + - `checkm2_downloads/CheckM2_database/*.dmnd`: Diamond database used by CheckM2. + +
+
+#### GUNC
+
+[Genome UNClutterer (GUNC)](https://grp-bork.embl-community.io/gunc/index.html) is a tool for detecting chimerism and contamination in prokaryotic genomes resulting from mis-binning of genomic contigs from unrelated lineages. It does so by applying an entropy-based score to the taxonomic assignment and contig location of all genes in a genome. It is generally considered an additional complement to CheckM results.
+
+Output files
+
+- `GenomeBinning/QC/gunc_summary.tsv`
+- `GenomeBinning/QC/gunc_checkm_summary.tsv`
+- `[gunc-database].dmnd`
+- `GUNC/`
+  - `raw/`
+    - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/[fasta input file name]/GUNC.progenomes_2.1.maxCSS_level.tsv`: Per sample GUNC [output](https://grp-bork.embl-community.io/gunc/output.html) containing taxonomic and completeness QC statistics.
+  - `checkmmerged/`
+    - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/[checkm input file name]/GUNC_checkM.merged.tsv`: Per sample GUNC output merged with output from [CheckM](#checkm)
+
+
+GUNC will be run standalone if specified with `--run_gunc`, unless CheckM is also activated via `--qc_tool 'checkm'`, in which case the GUNC output will be merged with the CheckM output using `gunc merge_checkm`.
+
+If `--gunc_save_db` is specified, the output directory will also contain the requested database (progenomes or GTDB) in DIAMOND format.
+
+## Taxonomic classification of binned genomes
+
+### CAT
+
+[CAT](https://github.com/dutilh/CAT) is a toolkit for annotating contigs and bins from metagenome-assembled genomes. The nf-core/mag pipeline uses CAT to assign taxonomy to genome bins based on the taxonomy of the contigs.
+
    +Output files + +- `Taxonomy/CAT/[assembler]/[binner]/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names +- `Taxonomy/CAT/[assembler]/[binner]/raw/` + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins + - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].log`: Log files + +
    + +If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally the generated CAT database is stored: + +
    +Output files + +- `Taxonomy/CAT/CAT_prepare_*.tar.gz`: Generated and used CAT database. + +
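+
+As sketched below, the CAT database can be generated by the pipeline and saved for reuse in later runs:
+
+```bash
+# Generate the CAT database on the fly and keep it in the results directory
+nextflow run nf-core/mag \
+  -profile docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --cat_db_generate \
+  --save_cat_db
+```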
    + +### GTDB-Tk + +[GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) is a toolkit for assigning taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy [GTDB](https://gtdb.ecogenomic.org/). nf-core/mag uses GTDB-Tk to classify binned genomes which satisfy certain quality criteria (i.e. completeness and contamination assessed with the BUSCO analysis). + +
    +Output files + +- `Taxonomy/GTDB-Tk/[assembler]/[binner]/[sample/group]/` + - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html)). + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].*.log`: Log files. + - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes. +- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk (listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`). + +
    + +## Genome annotation of binned genomes + +### Prokka + +Whole genome annotation is the process of identifying features of interest in a set of genomic DNA sequences, and labelling them with useful information. [Prokka](https://github.com/tseemann/prokka) is a software tool to annotate bacterial, archaeal and viral genomes quickly and produce standards-compliant output files. + +
    +Output files + +- `Annotation/Prokka/[assembler]/[bin]/` + - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format, containing both sequences and annotations + - `[assembler]-[binner]-[bin].gbk`: annotation in GenBank format, containing both sequences and annotations + - `[assembler]-[binner]-[bin].fna`: nucleotide FASTA file of the input contig sequences + - `[assembler]-[binner]-[bin].faa`: protein FASTA file of the translated CDS sequences + - `[assembler]-[binner]-[bin].ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA) + - `[assembler]-[binner]-[bin].sqn`: an ASN1 format "Sequin" file for submission to Genbank + - `[assembler]-[binner]-[bin].fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file + - `[assembler]-[binner]-[bin].tbl`: feature Table file, used by "tbl2asn" to create the .sqn file + - `[assembler]-[binner]-[bin].err`: unacceptable annotations - the NCBI discrepancy report. + - `[assembler]-[binner]-[bin].log`: contains all the output that Prokka produced during its run + - `[assembler]-[binner]-[bin].txt`: statistics relating to the annotated features found + - `[assembler]-[binner]-[bin].tsv`: tab-separated file of all features (locus_tag, ftype, len_bp, gene, EC_number, COG, product) + +
    + +### MetaEuk + +In cases where eukaryotic genomes are recovered in binning, [MetaEuk](https://github.com/soedinglab/metaeuk) is also available to annotate eukaryotic genomes quickly with standards-compliant output files. + +
    +Output files + +- `Annotation/MetaEuk/[assembler]/[bin]` + - `[assembler]-[binner]-[bin].fas`: fasta file of protein sequences identified by MetaEuk + - `[assembler]-[binner]-[bin].codon.fas`: fasta file of nucleotide sequences corresponding to the protein sequences fasta + - `[assembler]-[binner]-[bin].headersMap.tsv`: tab-separated table containing the information from each header in the fasta files + - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format + +
    + +## Additional summary for binned genomes + +
+Output files
+
+- `GenomeBinning/bin_summary.tsv`: Summary of bin sequencing depths together with BUSCO, CheckM, QUAST and GTDB-Tk results, if at least one of the latter was generated. This will also include refined bins if `--refine_bins_dastool` binning refinement is performed. Note that, in contrast to the other tools, for CheckM the bin name given in the column "Bin Id" does not contain the ".fa" extension.
+
+
+## Ancient DNA
+
+Optional, only run when the `ancient_dna` profile is specified (`-profile ancient_dna`).
+
+### `PyDamage`
+
+[PyDamage](https://github.com/maxibor/pydamage) is a tool to automate the process of ancient DNA damage identification and estimation from contigs. After modelling the ancient DNA damage using the C to T transitions, PyDamage uses a likelihood ratio test to discriminate between truly ancient contigs and modern contigs originating from sample contamination.
+
    +Output files + +- `Ancient_DNA/pydamage/analyze` + - `[assembler]_[sample/group]/pydamage_results/pydamage_results.csv`: PyDamage raw result tabular file in `.csv` format. Format described here: [pydamage.readthedocs.io/en/0.62/output.html](https://pydamage.readthedocs.io/en/0.62/output.html) +- `Ancient_DNA/pydamage/filter` + - `[assembler]_[sample/group]/pydamage_results/pydamage_results.csv`: PyDamage filtered result tabular file in `.csv` format. Format described here: [pydamage.readthedocs.io/en/0.62/output.html](https://pydamage.readthedocs.io/en/0.62/output.html) + +
+
+### `variant_calling`
+
+Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correct consensus for the contig sequence. To avoid this, when `--run_ancient_damagecorrection` is supplied the consensus is optionally re-called with variant calling software using the reads aligned back to the contigs.
+
    +Output files + +- `variant_calling/consensus` + - `[assembler]_[sample/group].fa`: contigs sequence with re-called consensus from read-to-contig alignment +- `variant_calling/unfiltered` + - `[assembler]_[sample/group].vcf.gz`: raw variant calls of the reads aligned back to the contigs. +- `variant_calling/filtered` + - `[assembler]_[sample/group].filtered.vcf.gz`: quality filtered variant calls of the reads aligned back to the contigs. + +
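+
+A minimal sketch of an ancient DNA run with damage-based consensus correction enabled, following the options described above (paths are placeholders):
+
+```bash
+# Ancient DNA mode with pyDamage analysis and consensus re-calling
+nextflow run nf-core/mag \
+  -profile ancient_dna,docker \
+  --input samplesheet.csv \
+  --outdir results \
+  --run_ancient_damagecorrection
+```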
    + +### MultiQC
Output files @@ -40,7 +774,24 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d [MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see .### Pipeline information +Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . + +The general stats table at the top of the report will by default only display the most relevant pre- and post-processing statistics prior to assembly, i.e., FastQC, fastp/Adapter removal, and Bowtie2 PhiX and host removal mapping results. + +Note that the FastQC raw and processed columns are right next to each other for improved visual comparability; however, the processed columns represent the input reads _after_ fastp/Adapter Removal processing (the dedicated columns of which come directly after the two FastQC sets of columns). Hover your cursor over each column name to see which tool the column is derived from. + +Summary tool-specific plots and tables of the following tools are currently displayed (if activated): + +- FastQC (pre- and post-trimming) +- fastp +- Adapter Removal +- bowtie2 +- BUSCO +- QUAST +- Kraken2 / Centrifuge +- PROKKA + +### Pipeline information
    Output files @@ -48,7 +799,6 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - `pipeline_info/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - Parameters used by the pipeline run: `params.json`.
diff --git a/docs/usage.md b/docs/usage.md
index 3d3de1c6..cdddf0e0 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -4,60 +4,105 @@
 > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._
-## Introduction
+## Input specifications
-
+The input data can be passed to nf-core/mag in two possible ways: either using the `--input` parameter alone (raw reads only), or using `--input` together with `--assembly_input`, which specifies pre-built assemblies.
-## Samplesheet input
+### Samplesheet input file
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+You can specify a CSV samplesheet input file that contains the paths to your FASTQ files and additional metadata. Furthermore, when a `run` column is present, the pipeline will also perform run- or lane-wise concatenation, for cases where you may have a sample or library sequenced with the same sequencing configuration across multiple runs. The optional run merging happens after short read QC (adapter clipping, host/PhiX removal etc.), and prior to normalisation, taxonomic profiling, and assembly.
-```bash
---input '[path to samplesheet file]'
+At a minimum, the CSV file should contain the following columns:
+
+`sample,group,short_reads_1,short_reads_2,long_reads`
+
+The paths to `long_reads` and `short_reads_2` are optional. Valid examples could look like the following:
+
+```csv title="samplesheet.csv"
+sample,group,short_reads_1,short_reads_2,long_reads
+sample1,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
+sample2,0,data/sample2_R1.fastq.gz,data/sample2_R2.fastq.gz,data/sample2.fastq.gz
+sample3,1,data/sample3_R1.fastq.gz,data/sample3_R2.fastq.gz,
 ```
-### Multiple runs of the same sample
+or
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
+```csv title="samplesheet.csv"
+sample,group,short_reads_1,short_reads_2,long_reads
+sample1,0,data/sample1.fastq.gz,,
+sample2,0,data/sample2.fastq.gz,,
+```
+
+or, to additionally perform run merging of two runs of sample1:
 ```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
+sample,run,group,short_reads_1,short_reads_2,long_reads
+sample1,1,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
+sample1,2,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz,data/sample1.fastq.gz
+sample2,0,0,data/sample2_R1.fastq.gz,data/sample2_R2.fastq.gz,data/sample2.fastq.gz
+sample3,1,0,data/sample3_R1.fastq.gz,data/sample3_R2.fastq.gz,
 ```
-### Full samplesheet
+Please note the following requirements:
+
+- a minimum of 5 comma-separated columns
+- Valid file extension: `.csv`
+- Must contain the header `sample,group,short_reads_1,short_reads_2,long_reads` (where `run` can be optionally added)
+- Run IDs must be unique within a multi-run sample.
A sample with multiple runs will be automatically concatenated. +- FastQ files must be compressed (`.fastq.gz`, `.fq.gz`) +- `long_reads` can only be provided in combination with paired-end short read data +- Within one samplesheet either only single-end or only paired-end reads can be specified +- If single-end reads are specified, the command line parameter `--single_end` must be specified as well + +Again, by default, the group information is only used to compute co-abundances for the binning step, but not for group-wise co-assembly (see the parameter docs for [`--coassemble_group`](https://nf-co.re/mag/parameters#coassemble_group) and [`--binning_map_mode`](https://nf-co.re/mag/parameters#binning_map_mode) for more information about how this group information can be used). + +### Supplying pre-computed assemblies + +It is also possible to run nf-core/mag on pre-computed assemblies, by supplying a CSV file to the parameter `--assembly_input` in addition to the raw reads supplied to `--input`. Supplying assembly input skips all read pre-processing and assembly, jumping straight to the binning stage of the pipeline. + +The assembly CSV file should contain the following columns: -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +`id,group,assembler,fasta` -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +Where `id` is the ID of the assembly, group is the assembly/binning group (see samplesheet information section for more details), `assembler` is the assembler used to produce the assembly (one of `MEGAHIT`, `SPAdes`, or `SPAdesHybrid`), and `fasta` is the path to the assembly fasta file. Input fasta files can be compressed or uncompressed, but compressed assemblies will be automatically uncompressed for use within the pipeline. The exact information required for each supplied assembly depends on whether the assemblies provided are single assemblies or group-wise co-assemblies. For the following example `--input` CSV: ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +sample,group,short_reads_1,short_reads_2,long_reads +sample1,0,data/sample1_R1.fastq.gz,data/sample1_R2.fastq.gz, +sample2,0,data/sample2_R1.fastq.gz,data/sample2_R2.fastq.gz, +sample3,1,data/sample3_R1.fastq.gz,data/sample3_R2.fastq.gz, ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. 
File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+If the assemblies are single assemblies, then the `id` and `group` columns should match those supplied in the `--input` read CSV files for each read set:
-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+```csv title="samplesheet.csv"
+id,group,assembler,fasta
+sample1,0,MEGAHIT,MEGAHIT-sample1.contigs.fa.gz
+sample1,0,SPAdes,SPAdes-sample1.fasta.gz
+sample2,0,MEGAHIT,MEGAHIT-sample2.contigs.fa.gz
+sample2,0,SPAdes,SPAdes-sample2.contigs.fasta.gz
+sample3,1,MEGAHIT,MEGAHIT-sample3.contigs.fa.gz
+sample3,1,SPAdes,SPAdes-sample3.contigs.fasta.gz
+```
+
+If the assemblies are co-assemblies, the parameter `--coassemble_group` should additionally be specified. In this case, the `id` column should uniquely identify the assembly, while `group` should match those specified in the `--input` CSV file:
+
+```csv title="samplesheet.csv"
+id,group,assembler,fasta
+group-0,0,MEGAHIT,MEGAHIT-group-0.contigs.fa.gz
+group-0,0,SPAdes,SPAdes-group-0.contigs.fasta.gz
+group-1,1,MEGAHIT,MEGAHIT-group-1.contigs.fa.gz
+group-1,1,SPAdes,SPAdes-group-1.contigs.fasta.gz
+```
+
+When supplying pre-computed assemblies, reads **must** also be provided in the CSV input format to `--input`, and should be the reads used to build the assemblies, i.e., adapter-removed, run-merged etc. Preprocessing steps will not be run on raw reads when pre-computed assemblies are supplied. As long reads are only used for assembly, any long read fastq files listed in the reads CSV are ignored.
 
 ## Running the pipeline
 
 The typical command for running the pipeline is as follows:
 
 ```bash
-nextflow run nf-core/mag --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker
+nextflow run nf-core/mag --input samplesheet.csv --outdir <OUTDIR> -profile docker
 ```
 
 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
@@ -89,12 +134,13 @@ with:
 ```yaml title="params.yaml"
 input: './samplesheet.csv'
 outdir: './results/'
-genome: 'GRCh37'
 <...>
 ```
 
 You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).
+
+See the [nf-core/mag website documentation](https://nf-co.re/mag/parameters) for more information about pipeline specific parameters.
+
 ### Updating the pipeline
 
 When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
@@ -116,6 +162,19 @@ To further assist in reproducibility, you can use share and reuse [parameter fil
 > [!TIP]
 > If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles.
 
+Additionally, to also enable reproducible results from the individual assembly tools, this pipeline provides extra parameters. SPAdes is designed to be deterministic for a given number of threads. To generate reproducible results, set the number of cpus with `--spades_fix_cpus` or `--spadeshybrid_fix_cpus`.
This will overwrite the number of CPUs specified in the `base.config` file and additionally ensure that it is not increased in case of retries for individual samples. MEGAHIT only generates reproducible results when run single-threaded. +You can fix the CPU count to one by using the parameter `--megahit_fix_cpu_1`. In both cases, do not specify the number of CPUs for these processes in additional custom config files, as this would result in an error. + +MetaBAT2 is run by default with a fixed seed within this pipeline, thus producing reproducible results. + +To also allow reproducible bin QC with BUSCO, run BUSCO providing already downloaded lineage datasets (BUSCO will be run using automated lineage selection in offline mode) or provide a specific lineage dataset via `--busco_db` and use the parameter `--save_busco_db`. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain (easily) accessible. + +For the taxonomic bin classification with [CAT](https://github.com/dutilh/CAT), when running the pipeline with `--cat_db_generate` the parameter `--save_cat_db` can be used to also save the generated database to allow reproducibility in future runs. Note that when specifying a pre-built database with `--cat_db`, the database cannot currently be saved. + +When it comes to visualizing taxonomic data using [Krona](https://github.com/marbl/Krona), you have the option to provide a taxonomy file, such as `taxonomy.tab`, using the `--krona_db` parameter. If you don't supply a taxonomy file, Krona is designed to automatically download the required taxonomy data for visualization. + +The taxonomic classification of bins with GTDB-Tk is not guaranteed to be reproducible, since the placement of bins in the reference tree is non-deterministic. However, the authors of the GTDB-Tk article examined the reproducibility on a set of 100 genomes across 50 trials and did not observe any difference (see [https://doi.org/10.1093/bioinformatics/btz848](https://doi.org/10.1093/bioinformatics/btz848)). + ## Core Nextflow arguments > [!NOTE] @@ -167,8 +226,6 @@ You can also supply a run name to resume a specific run: `-resume [run-name]`. U Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. -## Custom configuration - ### Resource requests Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the pipeline steps, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with a higher resource request (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. @@ -187,6 +244,52 @@ A pipeline might not always support every possible argument or option of a parti To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. 
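As a minimal sketch of such a custom config (passed to the pipeline with `-c custom.config`), the process selectors, argument values and resource values below are illustrative assumptions rather than pipeline defaults; check the pipeline source or your execution log for the exact process names before using them:

```nextflow
// custom.config -- illustrative sketch only; process names and values are assumptions
process {
    // pass an extra argument through to one tool (appended via ext.args)
    withName: 'MEGAHIT' {
        ext.args = '--min-contig-len 500'
    }
    // raise the memory available to a single memory-hungry step
    // (see also the 'Resource requests' section above)
    withName: 'SPADES' {
        memory = 128.GB
    }
}
```

Keep in mind the note below: do not override the CPU count of the assembly processes when the corresponding `--*_fix_cpus` parameters are set.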
+Note: do not change the number of CPUs with custom config files for the processes `spades`, `spadeshybrid` or `megahit` when specifying the parameters `--spades_fix_cpus`, `--spadeshybrid_fix_cpus` and `--megahit_fix_cpu_1`, respectively. + +> **NB:** We specify the full process name i.e. `NFCORE_RNASEQ:RNASEQ:ALIGN_STAR:STAR_ALIGN` in the config file because this takes priority over the short name (`STAR_ALIGN`) and allows existing configuration using the full process name to be correctly overridden. +> +> If you get a warning suggesting that the process selector isn't recognised, check that the process name has been specified correctly. + +### Updating containers (advanced users) + +The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. Given that the lineage assignments change quite frequently, it doesn't make sense to re-release the nf-core/viralrecon pipeline every time a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. + +1. Check the default version used by the pipeline in the module file for [Pangolin](https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/modules/nf-core/software/pangolin/main.nf#L14-L19) +2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags) +3. Create the custom config accordingly: + +- For Docker: + + ```nextflow + process { + withName: PANGOLIN { + container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' + } + } + ``` + +- For Singularity: + + ```nextflow + process { + withName: PANGOLIN { + container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' + } + } + ``` + +- For Conda: + + ```nextflow + process { + withName: PANGOLIN { + conda = 'bioconda::pangolin=3.0.5' + } + } + ``` + +> **NB:** If you wish to periodically update individual tool-specific results (e.g. Pangolin) generated by the pipeline then you must ensure to keep the `work/` directory otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch. + ### nf-core/configs In most cases, you will only need to create a custom config as a one-off, but if you and others within your organisation are likely to be running nf-core pipelines regularly and need to use the same settings regularly, it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this, please test that the config file works with your pipeline of choice using the `-c` parameter. 
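For example, a minimal institutional config might look like the sketch below (all names and values here are hypothetical, not an existing nf-core profile); you could check it behaves as expected with something like `nextflow run nf-core/mag -profile test,docker --outdir <OUTDIR> -c institution.config` before opening a request:

```nextflow
// institution.config -- hypothetical example for local testing only
params {
    config_profile_name        = 'My institution'
    config_profile_description = 'Cluster settings used at my institution (illustrative)'
}

process {
    // scheduler settings are assumptions; adapt to your own cluster
    executor = 'slurm'
    queue    = 'medium'
}
```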
You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. @@ -212,3 +315,23 @@ We recommend adding the following line to your environment to limit this (typica ```bash NXF_OPTS='-Xms1g -Xmx4g' ``` + +## A note on the ancient DNA subworkflow + +nf-core/mag integrates an additional subworkflow to validate ancient DNA _de novo_ assembly: + +[Characteristic patterns of ancient DNA (aDNA) damage](https://doi.org/10.1073/pnas.0704665104), namely DNA fragmentation and cytosine deamination (observed as C-to-T transitions), are typically used to authenticate aDNA sequences. By identifying assembled contigs carrying typical aDNA damage using [PyDamage](https://github.com/maxibor/pydamage), nf-core/mag can report and distinguish ancient contigs from contigs carrying no aDNA damage. Furthermore, to mitigate the effect of aDNA damage on contig sequence assembly, [freebayes](https://github.com/freebayes/freebayes) in combination with [BCFtools](https://github.com/samtools/bcftools) is used to (re)call the variants from the reads aligned to the contigs, and (re)generate contig consensus sequences. + +## A note on bin refinement + +### Error reporting + +DAS Tool may not always be able to refine bins due to insufficient recovery of single-copy genes. In these cases, you will get a NOTE such as: + +```bash +[16/d330a6] NOTE: Process `NFCORE_MAG:MAG:BINNING_REFINEMENT:DASTOOL_DASTOOL (test_minigut_sample2)` terminated with an error exit status (1) -- Error is ignored +``` + +In this case, DAS Tool has not necessarily failed but was unable to complete the refinement. You should therefore not expect to find any output files in the `GenomeBinning/DASTool/` results directory for that particular sample. + +If you are regularly getting such errors, you can try reducing the `--refine_bins_dastool_threshold` value, which will modify the scoring threshold defined in the [DAS Tool publication](https://www.nature.com/articles/s41564-018-0171-1). 
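As a sketch of what relaxing the threshold could look like on the command line (the value `0.3` is purely illustrative, and the surrounding options are just an example invocation; consult the parameter documentation for the default and for how bin refinement is enabled):

```bash
nextflow run nf-core/mag \
    -profile docker \
    --input samplesheet.csv \
    --outdir <OUTDIR> \
    --refine_bins_dastool \
    --refine_bins_dastool_threshold 0.3
```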
diff --git a/main.nf b/main.nf index 037c6fcb..3771fb1f 100644 --- a/main.nf +++ b/main.nf @@ -15,7 +15,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { MAG } from './workflows/mag' +include { MAG } from './workflows/mag' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_mag_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_mag_pipeline' include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_mag_pipeline' @@ -26,10 +26,10 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_mag_ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// TODO nf-core: Remove this line if you don't need a FASTA file +// TODO nf-core: Remove this line if you don't need a FASTA file [TODO: try and test using for --host_fasta and --host_genome] // This is an example of how to use getGenomeAttribute() to fetch parameters // from igenomes.config using `--genome` -params.fasta = getGenomeAttribute('fasta') +// params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -43,7 +43,9 @@ params.fasta = getGenomeAttribute('fasta') workflow NFCORE_MAG { take: - samplesheet // channel: samplesheet read in from --input + raw_short_reads // channel: samplesheet read in from --input + raw_long_reads + input_assemblies main: @@ -51,7 +53,9 @@ workflow NFCORE_MAG { // WORKFLOW: Run pipeline // MAG ( - samplesheet + raw_short_reads, // channel: samplesheet read in from --input + raw_long_reads, + input_assemblies ) emit: multiqc_report = MAG.out.multiqc_report // channel: /path/to/multiqc_report.html @@ -81,7 +85,9 @@ workflow { // WORKFLOW: Run main workflow // NFCORE_MAG ( - PIPELINE_INITIALISATION.out.samplesheet + PIPELINE_INITIALISATION.out.raw_short_reads, + PIPELINE_INITIALISATION.out.raw_long_reads, + PIPELINE_INITIALISATION.out.input_assemblies ) // // SUBWORKFLOW: Run completion tasks diff --git a/modules.json b/modules.json index 7fd3aee1..05e3b3dd 100644 --- a/modules.json +++ b/modules.json @@ -5,20 +5,297 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "adapterremoval": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "aria2": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"], + "patch": "modules/nf-core/aria2/aria2.diff" + }, + "bbmap/bbnorm": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"] + }, + "bcftools/consensus": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "bcftools/index": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "bcftools/view": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "installed_by": ["modules"] + }, + "centrifuge/centrifuge": { + "branch": "master", + "git_sha": "9a07a1293d9b818d1e06d0f7b58152f74d462012", + "installed_by": ["modules"] + }, + "centrifuge/kreport": { + "branch": "master", + "git_sha": "9a07a1293d9b818d1e06d0f7b58152f74d462012", + "installed_by": ["modules"], + "patch": 
"modules/nf-core/centrifuge/kreport/centrifuge-kreport.diff" + }, + "checkm/lineagewf": { + "branch": "master", + "git_sha": "3ea318161b8788623cec477bde0f089180b2245b", + "installed_by": ["modules"] + }, + "checkm/qa": { + "branch": "master", + "git_sha": "867961a8ef91135475ca48c83743646038be4196", + "installed_by": ["modules"] + }, + "checkm2/databasedownload": { + "branch": "master", + "git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d", + "installed_by": ["modules"] + }, + "checkm2/predict": { + "branch": "master", + "git_sha": "e17652681c856afaf2e240ba4c98bf4631a0fe2d", + "installed_by": ["modules"] + }, + "chopper": { + "branch": "master", + "git_sha": "22737835af2db3dd0d5b6b332e75e160d0199fae", + "installed_by": ["modules"] + }, + "concoct/concoct": { + "branch": "master", + "git_sha": "baa30accc6c50ea8a98662417d4f42ed18966353", + "installed_by": ["fasta_binning_concoct"] + }, + "concoct/concoctcoveragetable": { + "branch": "master", + "git_sha": "baa30accc6c50ea8a98662417d4f42ed18966353", + "installed_by": ["fasta_binning_concoct"] + }, + "concoct/cutupfasta": { + "branch": "master", + "git_sha": "73a6d7e6077b88aba1c5d6805635d79d6718270c", + "installed_by": ["fasta_binning_concoct"] + }, + "concoct/extractfastabins": { + "branch": "master", + "git_sha": "baa30accc6c50ea8a98662417d4f42ed18966353", + "installed_by": ["fasta_binning_concoct"] + }, + "concoct/mergecutupclustering": { + "branch": "master", + "git_sha": "baa30accc6c50ea8a98662417d4f42ed18966353", + "installed_by": ["fasta_binning_concoct"] + }, + "dastool/dastool": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "dastool/fastatocontig2bin": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "fastp": { + "branch": "master", + "git_sha": "d497a4868ace3302016ea8ed4b395072d5e833cd", + "installed_by": ["modules"] + }, "fastqc": { "branch": "master", "git_sha": "dc94b6ee04a05ddb9f7ae050712ff30a13149164", "installed_by": ["modules"] }, + "filtlong": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, + "freebayes": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "genomad/download": { + "branch": "master", + "git_sha": "ca813f3f73adedf3547a5a677e992d9d43a71870", + "installed_by": ["modules"] + }, + "genomad/endtoend": { + "branch": "master", + "git_sha": "ca813f3f73adedf3547a5a677e992d9d43a71870", + "installed_by": ["modules"] + }, + "gtdbtk/classifywf": { + "branch": "master", + "git_sha": "7b9ce4b817926f17ec82cc0099d2d0ff095a2fac", + "installed_by": ["modules"] + }, + "gunc/downloaddb": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "gunc/mergecheckm": { + "branch": "master", + "git_sha": "b6515a01897b11b64b3368858c0359b4c813ad1e", + "installed_by": ["modules"] + }, + "gunc/run": { + "branch": "master", + "git_sha": "b6515a01897b11b64b3368858c0359b4c813ad1e", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "e06548bfa36ee31869b81041879dd6b3a83b1d57", + "installed_by": ["modules"] + }, + "krakentools/kreport2krona": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "krona/kronadb": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] 
+ }, + "krona/ktimporttaxonomy": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "maxbin2": { + "branch": "master", + "git_sha": "283613159e079152f1336cef0db1c836086206e0", + "installed_by": ["modules"] + }, + "megahit": { + "branch": "master", + "git_sha": "9142b390538283705c084e4d612170972ff60326", + "installed_by": ["modules"] + }, + "metabat2/jgisummarizebamcontigdepths": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "metabat2/metabat2": { + "branch": "master", + "git_sha": "d2e220fdec3aa2f4482c70017df4cdf8a4c94f27", + "installed_by": ["modules"] + }, + "metaeuk/easypredict": { + "branch": "master", + "git_sha": "30d06da5bd7ae67be32758bf512cd75a4325d386", + "installed_by": ["modules"] + }, + "mmseqs/databases": { + "branch": "master", + "git_sha": "699e078133f580548aeb43114f93ac29928c6143", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", "installed_by": ["modules"] + }, + "nanolyse": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "nanoplot": { + "branch": "master", + "git_sha": "3135090b46f308a260fc9d5991d7d2f9c0785309", + "installed_by": ["modules"] + }, + "nanoq": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, + "porechop/abi": { + "branch": "master", + "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48", + "installed_by": ["modules"] + }, + "porechop/porechop": { + "branch": "master", + "git_sha": "1d68c7f248d1a480c5959548a9234602b771199e", + "installed_by": ["modules"] + }, + "prodigal": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, + "prokka": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "pydamage/analyze": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "pydamage/filter": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "samtools/faidx": { + "branch": "master", + "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", + "installed_by": ["modules"] + }, + "seqtk/mergepe": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "spades": { + "branch": "master", + "git_sha": "cfebb244d8c83ae533bf2db399f9af361927d504", + "installed_by": ["modules"] + }, + "tiara/tiara": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "untar": { + "branch": "master", + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "installed_by": ["modules"] } } }, "subworkflows": { "nf-core": { + "fasta_binning_concoct": { + "branch": "master", + "git_sha": "c60c14b285b89bdd0607e371417dadb80385ad6e", + "installed_by": ["subworkflows"] + }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", diff --git a/modules/local/adjust_maxbin2_ext.nf b/modules/local/adjust_maxbin2_ext.nf new file mode 100644 index 00000000..70eae99c --- /dev/null +++ b/modules/local/adjust_maxbin2_ext.nf @@ -0,0 +1,27 @@ +process ADJUST_MAXBIN2_EXT { + tag "${meta.assembler}-${meta.id}" + label 'process_low' + + conda 
"conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(bins) + + output: + tuple val(meta), path("*.fa.gz"), emit: renamed_bins + + script: + """ + if [ -n "${bins}" ] + then + for file in ${bins}; do + [[ \${file} =~ (.*).fasta.gz ]]; + bin="\${BASH_REMATCH[1]}" + mv \${file} \${bin}.fa.gz + done + fi + """ +} diff --git a/modules/local/bin_summary.nf b/modules/local/bin_summary.nf new file mode 100644 index 00000000..07784d83 --- /dev/null +++ b/modules/local/bin_summary.nf @@ -0,0 +1,41 @@ +process BIN_SUMMARY { + + conda "conda-forge::pandas=1.4.3" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' + : 'biocontainers/pandas:1.4.3'}" + + input: + path bin_depths + path binqc_sum + path quast_sum + path gtdbtk_sum + path cat_sum + val binqc_tool + + output: + path "bin_summary.tsv", emit: summary + path "versions.yml" , emit: versions + + script: + def binqc_summary = binqc_sum.sort().size() > 0 ? "--binqc_summary ${binqc_sum}" : "" + def quast_summary = quast_sum.sort().size() > 0 ? "--quast_summary ${quast_sum}" : "" + def gtdbtk_summary = gtdbtk_sum.sort().size() > 0 ? "--gtdbtk_summary ${gtdbtk_sum}" : "" + def cat_summary = cat_sum.sort().size() > 0 ? "--cat_summary ${cat_sum}" : "" + """ + combine_tables.py \ + --depths_summary ${bin_depths} \ + ${binqc_summary} \ + ${quast_summary} \ + ${gtdbtk_summary} \ + ${cat_summary} \ + --binqc_tool ${binqc_tool} \ + --out bin_summary.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + END_VERSIONS + """ +} diff --git a/modules/local/bowtie2_assembly_align.nf b/modules/local/bowtie2_assembly_align.nf new file mode 100644 index 00000000..951dfb8d --- /dev/null +++ b/modules/local/bowtie2_assembly_align.nf @@ -0,0 +1,44 @@ +process BOWTIE2_ASSEMBLY_ALIGN { + tag "${assembly_meta.assembler}-${assembly_meta.id}-${reads_meta.id}" + + conda "bioconda::bowtie2=2.4.2 bioconda::samtools=1.11 conda-forge::pigz=2.3.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0' : + 'biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0' }" + + input: + tuple val(assembly_meta), path(assembly), path(index), val(reads_meta), path(reads) + + output: + tuple val(assembly_meta), path(assembly), path("${assembly_meta.assembler}-${assembly_meta.id}-${reads_meta.id}.bam"), path("${assembly_meta.assembler}-${assembly_meta.id}-${reads_meta.id}.bam.bai"), emit: mappings + tuple val(assembly_meta), val(reads_meta), path("*.bowtie2.log") , emit: log + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def name = "${assembly_meta.assembler}-${assembly_meta.id}-${reads_meta.id}" + def input = params.single_end ? 
"-U \"${reads}\"" : "-1 \"${reads[0]}\" -2 \"${reads[1]}\"" + """ + INDEX=`find -L ./ -name "*.rev.1.bt2l" -o -name "*.rev.1.bt2" | sed 's/.rev.1.bt2l//' | sed 's/.rev.1.bt2//'` + bowtie2 \\ + -p "${task.cpus}" \\ + -x \$INDEX \\ + $args \\ + $input \\ + 2> "${name}.bowtie2.log" | \ + samtools view -@ "${task.cpus}" -bS | \ + samtools sort -@ "${task.cpus}" -o "${name}.bam" + samtools index "${name}.bam" + + if [ ${name} = "${assembly_meta.assembler}-${assembly_meta.id}-${assembly_meta.id}" ] ; then + mv "${name}.bowtie2.log" "${assembly_meta.assembler}-${assembly_meta.id}.bowtie2.log" + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/local/bowtie2_assembly_build.nf b/modules/local/bowtie2_assembly_build.nf new file mode 100644 index 00000000..1f305f70 --- /dev/null +++ b/modules/local/bowtie2_assembly_build.nf @@ -0,0 +1,27 @@ +process BOWTIE2_ASSEMBLY_BUILD { + tag "${meta.assembler}-${meta.id}" + + conda "bioconda::bowtie2=2.4.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.2--py38h1c8e9b9_1' : + 'biocontainers/bowtie2:2.4.2--py38h1c8e9b9_1' }" + + input: + tuple val(meta), path(assembly) + + output: + tuple val(meta), path(assembly), path('bt2_index_base*'), emit: assembly_index + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + """ + mkdir bowtie + bowtie2-build --threads $task.cpus $assembly "bt2_index_base" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/bowtie2_removal_align.nf b/modules/local/bowtie2_removal_align.nf new file mode 100644 index 00000000..79eb0e47 --- /dev/null +++ b/modules/local/bowtie2_removal_align.nf @@ -0,0 +1,73 @@ +/* + * Bowtie2 for read removal + */ +process BOWTIE2_REMOVAL_ALIGN { + tag "$meta.id" + + conda "bioconda::bowtie2=2.4.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.2--py38h1c8e9b9_1' : + 'biocontainers/bowtie2:2.4.2--py38h1c8e9b9_1' }" + + input: + tuple val(meta), path(reads) + path index + + output: + tuple val(meta), path("*.unmapped*.fastq.gz") , emit: reads + path "*.mapped*.read_ids.txt", optional:true , emit: read_ids + tuple val(meta), path("*.bowtie2.log") , emit: log + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def save_ids = (args2.contains('--host_removal_save_ids')) ? 
"Y" : "N" + if (!meta.single_end){ + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"` + [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/\\.rev.1.bt2l\$//"` + [ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1 + + bowtie2 -p ${task.cpus} \ + -x \$INDEX \ + -1 "${reads[0]}" -2 "${reads[1]}" \ + $args \ + --un-conc-gz ${prefix}.unmapped_%.fastq.gz \ + --al-conc-gz ${prefix}.mapped_%.fastq.gz \ + 1> /dev/null \ + 2> ${prefix}.bowtie2.log + if [ ${save_ids} = "Y" ] ; then + gunzip -c ${prefix}.mapped_1.fastq.gz | awk '{if(NR%4==1) print substr(\$0, 2)}' | LC_ALL=C sort > ${prefix}.mapped_1.read_ids.txt + gunzip -c ${prefix}.mapped_2.fastq.gz | awk '{if(NR%4==1) print substr(\$0, 2)}' | LC_ALL=C sort > ${prefix}.mapped_2.read_ids.txt + fi + rm -f ${prefix}.mapped_*.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + bowtie2 -p ${task.cpus} \ + -x ${index[0].getSimpleName()} \ + -U ${reads} \ + $args \ + --un-gz ${prefix}.unmapped.fastq.gz \ + --al-gz ${prefix}.mapped.fastq.gz \ + 1> /dev/null \ + 2> ${prefix}.bowtie2.log + if [ ${save_ids} = "Y" ] ; then + gunzip -c ${prefix}.mapped.fastq.gz | awk '{if(NR%4==1) print substr(\$0, 2)}' | LC_ALL=C sort > ${prefix}.mapped.read_ids.txt + fi + rm -f ${prefix}.mapped.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/local/bowtie2_removal_build.nf b/modules/local/bowtie2_removal_build.nf new file mode 100644 index 00000000..f3922094 --- /dev/null +++ b/modules/local/bowtie2_removal_build.nf @@ -0,0 +1,27 @@ +process BOWTIE2_REMOVAL_BUILD { + tag "$fasta" + + conda "bioconda::bowtie2=2.4.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.2--py38h1c8e9b9_1' : + 'biocontainers/bowtie2:2.4.2--py38h1c8e9b9_1' }" + + input: + path fasta + + output: + path "*.bt2" , emit: index + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + """ + mkdir bowtie + bowtie2-build --threads $task.cpus $fasta ${fasta.simpleName} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/local/busco.nf b/modules/local/busco.nf new file mode 100644 index 00000000..4d0a561d --- /dev/null +++ b/modules/local/busco.nf @@ -0,0 +1,59 @@ +process BUSCO { + tag "${bin}" + + conda "bioconda::busco=5.4.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/busco:5.4.3--pyhdfd78af_0': + 'biocontainers/busco:5.4.3--pyhdfd78af_0' }" + + input: + tuple val(meta), path(bin) + tuple val(db_meta), path(db) + + output: + tuple val(meta), path("short_summary.domain.*.${bin}.txt") , optional:true , emit: summary_domain + tuple val(meta), path("short_summary.specific_lineage.*.${bin}.txt"), optional:true , emit: summary_specific + tuple env(most_spec_db), path('busco_downloads/') , optional:true , emit: busco_downloads + path("${bin}_busco.log") + path("${bin}_busco.err") + path("${bin}_buscos.*.faa.gz") , optional:true + path("${bin}_buscos.*.fna.gz") , optional:true + path("${bin}_prodigal.gff") , optional:true , emit: prodigal_genes + tuple val(meta), path("${bin}_busco.failed_bin.txt") , optional:true , emit: failed_bin + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def cp_augustus_config = workflow.profile.toString().indexOf("conda") != -1 ? "N" : "Y" + def lineage_dataset_provided = "${db_meta.lineage}" + def busco_clean = params.busco_clean ? "Y" : "N" + + def p = params.busco_auto_lineage_prok ? "--auto-lineage-prok" : "--auto-lineage" + if ( "${lineage_dataset_provided}" == "Y" ) { + p = "--lineage_dataset dataset/${db}" + } else if ( "${lineage_dataset_provided}" == "N" ) { + p += " --offline --download_path ${db}" + } else { + lineage_dataset_provided = "" + } + """ + run_busco.sh \\ + "${p}" \\ + "${cp_augustus_config}" \\ + "${db}" \\ + "${bin}" \\ + ${task.cpus} \\ + "${lineage_dataset_provided}" \\ + "${busco_clean}" \\ + "${args}" + + most_spec_db=\$( versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + R: \$(R --version 2>&1 | sed -n 1p | sed 's/R version //' | sed 's/ (.*//') + busco: \$(busco --version 2>&1 | sed 's/BUSCO //g') + END_VERSIONS + """ +} diff --git a/modules/local/busco_db_preparation.nf b/modules/local/busco_db_preparation.nf new file mode 100644 index 00000000..e3418cb6 --- /dev/null +++ b/modules/local/busco_db_preparation.nf @@ -0,0 +1,26 @@ +process BUSCO_DB_PREPARATION { + tag "${database.baseName}" + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path database + + output: + tuple val("${database.getSimpleName()}"), path("buscodb/*"), emit: db + path "versions.yml" , emit: versions + + script: + """ + mkdir buscodb + tar -xf ${database} -C buscodb + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tar: \$(tar --version 2>&1 | sed -n 1p | sed 's/tar (GNU tar) //') + END_VERSIONS + """ +} diff --git a/modules/local/busco_save_download.nf b/modules/local/busco_save_download.nf new file mode 100644 index 00000000..099c4150 --- /dev/null +++ b/modules/local/busco_save_download.nf @@ -0,0 +1,24 @@ +process BUSCO_SAVE_DOWNLOAD { + // execute sequentially to avoid artefacts when saving files for multiple busco instances + maxForks 1 + + conda "conda-forge::bash=5.2.21" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' + : 'nf-core/ubuntu:20.04' }" + + input: + path(busco_downloads) + + output: + path 'busco_downloads/**', includeInputs: true, emit: busco_files + path 'versions.yml' , emit: versions + + script: + """ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: \$(echo \$BASH_VERSION) + END_VERSIONS + """ +} diff --git a/modules/local/busco_summary.nf b/modules/local/busco_summary.nf new file mode 100644 index 00000000..bafcc495 --- /dev/null +++ b/modules/local/busco_summary.nf @@ -0,0 +1,35 @@ +process BUSCO_SUMMARY { + + conda "conda-forge::pandas=1.4.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : + 'biocontainers/pandas:1.4.3' }" + + input: + path(summaries_domain) + path(summaries_specific) + path(failed_bins) + + output: + path "busco_summary.tsv", emit: summary + path "versions.yml" , emit: versions + + script: + def reference = params.busco_db.toString().contains('odb10') + def auto = reference ? "" : "-a" + def ss = summaries_specific.sort().size() > 0 ? "-ss ${summaries_specific}" : "" + def sd = summaries_domain.sort().size() > 0 ? "-sd ${summaries_domain}" : "" + def f = "" + if ("${reference}" == false && failed_bins.sort().size() > 0) + f = "-f ${failed_bins}" + """ + summary_busco.py $auto $ss $sd $f -o busco_summary.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + END_VERSIONS + """ +} + diff --git a/modules/local/cat.nf b/modules/local/cat.nf new file mode 100644 index 00000000..90d44ae5 --- /dev/null +++ b/modules/local/cat.nf @@ -0,0 +1,48 @@ +process CAT { + tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}-${db_name}" + + conda "bioconda::cat=5.2.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' : + 'biocontainers/cat:5.2.3--hdfd78af_1' }" + + input: + tuple val(meta), path("bins/*") + tuple val(db_name), path("database/*"), path("taxonomy/*") + + output: + tuple val(meta), path("*.bin2classification.names.txt") , emit: tax_classification_names + path("*.ORF2LCA.names.txt.gz") , emit: orf2lca_classification + path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca + path("raw/*.predicted_proteins.faa.gz") , emit: faa + path("raw/*.predicted_proteins.gff.gz") , emit: gff + path("raw/*.log") , emit: log + path("raw/*.bin2classification.txt.gz") , emit: tax_classification_taxids + path "versions.yml" , emit: versions + + script: + def official_taxonomy = params.cat_official_taxonomy ? 
"--only_official" : "" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" + """ + CAT bins $args -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${prefix}" --I_know_what_Im_doing + CAT add_names -i "${prefix}.ORF2LCA.txt" -o "${prefix}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy} + CAT add_names -i "${prefix}.bin2classification.txt" -o "${prefix}.bin2classification.names.txt" -t taxonomy/ ${official_taxonomy} + + mkdir raw + mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/ + cp *.bin2classification.names.txt raw/ + gzip "raw/${prefix}.ORF2LCA.txt" \ + "raw/${prefix}.concatenated.predicted_proteins.faa" \ + "raw/${prefix}.concatenated.predicted_proteins.gff" \ + "raw/${prefix}.bin2classification.txt" \ + "${prefix}.ORF2LCA.names.txt" \ + "raw/${prefix}.bin2classification.names.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + CAT: \$(CAT --version | sed "s/CAT v//; s/(.*//") + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/local/cat_db.nf b/modules/local/cat_db.nf new file mode 100644 index 00000000..dac96bb0 --- /dev/null +++ b/modules/local/cat_db.nf @@ -0,0 +1,33 @@ +process CAT_DB { + tag "${database.baseName}" + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path(database) + + output: + tuple val("${database.toString().replace(".tar.gz", "")}"), path("database/*"), path("taxonomy/*"), emit: db + path "versions.yml" , emit: versions + + script: + """ + if [[ ${database} != *.tar.gz ]]; then + ln -sr `find ${database}/ -type d -name "*taxonomy*"` taxonomy + ln -sr `find ${database}/ -type d -name "*database*"` database + else + mkdir catDB + tar -xf ${database} -C catDB + mv `find catDB/ -type d -name "*taxonomy*"` taxonomy/ + mv `find catDB/ -type d -name "*database*"` database/ + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tar: \$(tar --version 2>&1 | sed -n 1p | sed 's/tar (GNU tar) //') + END_VERSIONS + """ +} diff --git a/modules/local/cat_db_generate.nf b/modules/local/cat_db_generate.nf new file mode 100644 index 00000000..eaf6c1b4 --- /dev/null +++ b/modules/local/cat_db_generate.nf @@ -0,0 +1,36 @@ +process CAT_DB_GENERATE { + + conda "bioconda::cat=4.6 bioconda::diamond=2.0.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' : + 'biocontainers/mulled-v2-75e2a26f10cbf3629edf2d1600db3fed5ebe6e04:eae321284604f7dabbdf121e3070bda907b91266-0' }" + + output: + tuple env(DB_NAME), path("database/*"), path("taxonomy/*"), emit: db + path("CAT_prepare_*.tar.gz"), optional:true , emit: db_tar_gz + path "versions.yml" , emit: versions + + script: + def save_db = params.save_cat_db ? 
"Y" : "N" + """ + CAT prepare --fresh + + # get name/date of generated datase + out=(*_taxonomy/) + [[ \$out =~ (.*)_taxonomy/ ]]; + DB_NAME="CAT_prepare_\${BASH_REMATCH[1]}" + + mv *_taxonomy taxonomy + mv *_database database + rm database/*.nr.gz + if [ ${save_db} = "Y" ] ; then + tar -cf - taxonomy database | gzip > "\${DB_NAME}".tar.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + CAT: \$(CAT --version | sed "s/CAT v//; s/(.*//") + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/local/cat_summary.nf b/modules/local/cat_summary.nf new file mode 100644 index 00000000..5f8631fa --- /dev/null +++ b/modules/local/cat_summary.nf @@ -0,0 +1,26 @@ +process CAT_SUMMARY { + label 'process_low' + + conda "bioconda::bioawk=1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' : + 'biocontainers/bioawk:1.0--hed695b0_5' }" + + input: + path(cat_summaries) + + output: + path("*.tsv") , emit: combined + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "cat_summary" + """ + bioawk '(NR == 1) || (FNR > 1)' *.txt > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioawk: \$(bioawk --version | cut -f 3 -d ' ' ) + END_VERSIONS + """ +} diff --git a/modules/local/centrifuge.nf b/modules/local/centrifuge.nf new file mode 100644 index 00000000..c6618417 --- /dev/null +++ b/modules/local/centrifuge.nf @@ -0,0 +1,37 @@ +process CENTRIFUGE { + tag "${meta.id}-${db_name}" + + conda "bioconda::centrifuge=1.0.4_beta" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4_beta--he513fc3_5' : + 'biocontainers/centrifuge:1.0.4_beta--he513fc3_5' }" + + input: + tuple val(meta), path(reads) + tuple val(db_name), path(db) + + output: + tuple val("centrifuge"), val(meta), path("results.krona"), emit: results_for_krona + path "report.txt" , emit: report + tuple val(meta), path("*kreport.txt") , emit: kreport + path "versions.yml" , emit: versions + + script: + def input = meta.single_end ? "-U \"${reads}\"" : "-1 \"${reads[0]}\" -2 \"${reads[1]}\"" + prefix = task.ext.prefix ?: "${meta.id}" + + """ + centrifuge -x "${db_name}" \ + -p ${task.cpus} \ + --report-file report.txt \ + -S results.txt \ + $input + centrifuge-kreport -x "${db_name}" results.txt > ${prefix}.centrifuge_kreport.txt + cat results.txt | cut -f 1,3 > results.krona + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$(centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/local/combine_tsv.nf b/modules/local/combine_tsv.nf new file mode 100644 index 00000000..1fe7ec1a --- /dev/null +++ b/modules/local/combine_tsv.nf @@ -0,0 +1,26 @@ +process COMBINE_TSV { + + // Using bioawk as already use that for CONVERT_DEPTHS and does same thing + conda "bioconda::bioawk=1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' : + 'biocontainers/bioawk:1.0--hed695b0_5' }" + + input: + path(bin_summaries, stageAs: "bin_summaries/*.tsv") + + output: + path("*.tsv") , emit: combined + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "bin_depths_summary_combined" + """ + bioawk '(NR == 1) || (FNR > 1)' ${bin_summaries} > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioawk: \$(bioawk --version | cut -f 3 -d ' ' ) + END_VERSIONS + """ +} diff --git a/modules/local/convert_depths.nf b/modules/local/convert_depths.nf new file mode 100644 index 00000000..0c54e5c6 --- /dev/null +++ b/modules/local/convert_depths.nf @@ -0,0 +1,41 @@ +process CONVERT_DEPTHS { + tag "${meta.id}" + + conda "bioconda::bioawk=1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' : + 'biocontainers/bioawk:1.0--hed695b0_5' }" + + input: + tuple val(meta), path(fasta), path(depth) + + output: + // need to add empty val because representing reads as we dont want maxbin to calculate for us. + tuple val(meta), path(fasta), val([]), path("*.abund"), emit: output + path "versions.yml" , emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gunzip -f $depth + + # Determine the number of abundance columns + n_abund=\$(awk 'NR==1 {print int((NF-3)/2)}' ${depth.toString() - '.gz'}) + + # Get column names + read -r header<${depth.toString() - '.gz'} + header=(\$header) + + # Generate abundance files for each read set + for i in \$(seq 1 \$n_abund); do + col=\$((i*2+2)) + name=\$( echo \${header[\$col-1]} | sed s/\\.bam\$// ) + bioawk -t '{if (NR > 1) {print \$1, \$'"\$col"'}}' ${depth.toString() - '.gz'} > \${name}.abund + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioawk: \$(bioawk --version | cut -f 3 -d ' ' ) + END_VERSIONS + """ +} diff --git a/modules/local/gtdbtk_db_preparation.nf b/modules/local/gtdbtk_db_preparation.nf new file mode 100644 index 00000000..3be79c96 --- /dev/null +++ b/modules/local/gtdbtk_db_preparation.nf @@ -0,0 +1,25 @@ +process GTDBTK_DB_PREPARATION { + tag "${database}" + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path(database) + + output: + tuple val("${database.toString().replace(".tar.gz", "")}"), path("database/*"), emit: db + + script: + """ + mkdir database + tar -xzf ${database} -C database --strip 1 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tar: \$(tar --version 2>&1 | sed -n 1p | sed 's/tar (GNU tar) //') + END_VERSIONS + """ +} diff --git a/modules/local/gtdbtk_summary.nf b/modules/local/gtdbtk_summary.nf new file mode 100644 index 00000000..52c0a40d --- /dev/null +++ b/modules/local/gtdbtk_summary.nf @@ -0,0 +1,34 @@ +process GTDBTK_SUMMARY { + + + conda "conda-forge::pandas=1.4.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : + 'biocontainers/pandas:1.4.3' }" + + input: + path(qc_discarded_bins) + path(gtdbtk_summaries) + path(filtered_bins) + path(failed_bins) + + output: + path "gtdbtk_summary.tsv", emit: summary + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def discarded = qc_discarded_bins.sort().size() > 0 ? "--qc_discarded_bins ${qc_discarded_bins}" : "" + def summaries = gtdbtk_summaries.sort().size() > 0 ? "--summaries ${gtdbtk_summaries}" : "" + def filtered = filtered_bins.sort().size() > 0 ? "--filtered_bins ${filtered_bins}" : "" + def failed = failed_bins.sort().size() > 0 ? "--failed_bins ${failed_bins}" : "" + """ + summary_gtdbtk.py $args $discarded $summaries $filtered $failed --out gtdbtk_summary.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + END_VERSIONS + """ +} diff --git a/modules/local/kraken2.nf b/modules/local/kraken2.nf new file mode 100644 index 00000000..b67118a9 --- /dev/null +++ b/modules/local/kraken2.nf @@ -0,0 +1,37 @@ +process KRAKEN2 { + tag "${meta.id}-${db_name}" + + conda "bioconda::kraken2=2.0.8_beta" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kraken2:2.0.8_beta--pl526hc9558a2_2' : + 'biocontainers/kraken2:2.0.8_beta--pl526hc9558a2_2' }" + + input: + tuple val(meta), path(reads) + tuple val(db_name), path("database/*") + + output: + tuple val("kraken2"), val(meta), path("results.krona"), emit: results_for_krona + tuple val(meta), path("*kraken2_report.txt") , emit: report + path "versions.yml" , emit: versions + + script: + def input = meta.single_end ? "\"${reads}\"" : "--paired \"${reads[0]}\" \"${reads[1]}\"" + prefix = task.ext.prefix ?: "${meta.id}" + + """ + kraken2 \ + --report-zero-counts \ + --threads ${task.cpus} \ + --db database \ + --report ${prefix}.kraken2_report.txt \ + $input \ + > kraken2.kraken + cat kraken2.kraken | cut -f 2,3 > results.krona + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //' | sed 's/ Copyright.*//') + END_VERSIONS + """ +} diff --git a/modules/local/kraken2_db_preparation.nf b/modules/local/kraken2_db_preparation.nf new file mode 100644 index 00000000..5ae68b7f --- /dev/null +++ b/modules/local/kraken2_db_preparation.nf @@ -0,0 +1,27 @@ +process KRAKEN2_DB_PREPARATION { + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path db + + output: + tuple val("${db.simpleName}"), path("database/*.k2d"), emit: db + path "versions.yml" , emit: versions + + script: + """ + mkdir db_tmp + tar -xf "${db}" -C db_tmp + mkdir database + mv `find db_tmp/ -name "*.k2d"` database/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tar: \$(tar --version 2>&1 | sed -n 1p | sed 's/tar (GNU tar) //') + END_VERSIONS + """ +} diff --git a/modules/local/krona.nf b/modules/local/krona.nf new file mode 100644 index 00000000..827cbc4a --- /dev/null +++ b/modules/local/krona.nf @@ -0,0 +1,28 @@ +process KRONA { + tag "${meta.classifier}-${meta.id}" + + conda "bioconda::krona=2.7.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krona:2.7.1--pl526_5' : + 'biocontainers/krona:2.7.1--pl526_5' }" + + input: + tuple val(meta), path(report) + path(taxonomy_file), stageAs: 'taxonomy.tab' + + output: + tuple val(meta), path("*.html") , emit: html + path "versions.yml" , emit: versions + + script: + """ + TAXONOMY=\$(find -L . -name '*.tab' -exec dirname {} \\;) + + ktImportTaxonomy ${report} -tax \$TAXONOMY/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ktImportTaxonomy: \$(ktImportTaxonomy 2>&1 | sed -n '/KronaTools /p' | sed 's/^.*KronaTools //; s/ - ktImportTaxonomy.*//') + END_VERSIONS + """ +} diff --git a/modules/local/mag_depths.nf b/modules/local/mag_depths.nf new file mode 100644 index 00000000..2ee63523 --- /dev/null +++ b/modules/local/mag_depths.nf @@ -0,0 +1,31 @@ +process MAG_DEPTHS { + tag "${meta.assembler}-${meta.binner}-${meta.id}" + + // Using container from metabat2 process, since this will be anyway already downloaded and contains biopython and pandas + conda "bioconda::metabat2=2.15 conda-forge::python=3.6.7 conda-forge::biopython=1.74 conda-forge::pandas=1.1.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' : + 'biocontainers/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' }" + + input: + tuple val(meta), path(bins), path(contig_depths) + + output: + tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}-binDepths.tsv"), emit: depths + path "versions.yml" , emit: versions + + script: + """ + get_mag_depths.py --bins ${bins} \\ + --depths ${contig_depths} \\ + --assembler ${meta.assembler} \\ + --id ${meta.id} \\ + --binner ${meta.binner} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + END_VERSIONS + """ +} diff --git a/modules/local/mag_depths_plot.nf b/modules/local/mag_depths_plot.nf new file mode 100644 index 00000000..2291ca2d --- /dev/null +++ b/modules/local/mag_depths_plot.nf @@ -0,0 +1,29 @@ +process MAG_DEPTHS_PLOT { + tag "${meta.assembler}-${meta.binner}-${meta.id}" + conda "conda-forge::python=3.9 conda-forge::pandas=1.3.0 conda-forge::seaborn=0.11.0 conda-forge::matplotlib=3.4.2" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-d14219255233ee6cacc427e28a7caf8ee42e8c91:0a22c7568e4a509925048454dad9ab37fa8fe776-0' + : 'biocontainers/mulled-v2-d14219255233ee6cacc427e28a7caf8ee42e8c91:0a22c7568e4a509925048454dad9ab37fa8fe776-0'}" + + input: + tuple val(meta), path(depths) + path sample_groups + + output: + tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}-binDepths.heatmap.png"), emit: heatmap + path "versions.yml", emit: versions + + script: + """ + plot_mag_depths.py --bin_depths ${depths} \ + --groups ${sample_groups} \ + --out "${meta.assembler}-${meta.binner}-${meta.id}-binDepths.heatmap.png" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + seaborn: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('seaborn').version)") + END_VERSIONS + """ +} diff --git a/modules/local/mag_depths_summary.nf b/modules/local/mag_depths_summary.nf new file mode 100644 index 00000000..1be7becc --- /dev/null +++ b/modules/local/mag_depths_summary.nf @@ -0,0 +1,27 @@ +process MAG_DEPTHS_SUMMARY { + + conda "conda-forge::pandas=1.4.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.4.3' : + 'biocontainers/pandas:1.4.3' }" + + input: + path(mag_depths) + + output: + path("${prefix}.tsv"), emit: summary + path "versions.yml" , emit: versions + + script: + prefix = task.ext.prefix ?: "bin_depths_summary" + """ + get_mag_depths_summary.py --depths ${mag_depths} \ + --out "${prefix}.tsv" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + END_VERSIONS + """ +} diff --git a/modules/local/nanolyse.nf b/modules/local/nanolyse.nf new file mode 100644 index 00000000..9e800ef0 --- /dev/null +++ b/modules/local/nanolyse.nf @@ -0,0 +1,30 @@ +process NANOLYSE { + tag "$meta.id" + + conda "bioconda::nanolyse=1.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/nanolyse:1.1.0--py36_1' : + 'biocontainers/nanolyse:1.1.0--py36_1' }" + + input: + tuple val(meta), path(reads) + path nanolyse_db + + output: + tuple val(meta), path("${meta.id}_nanolyse.fastq.gz"), emit: reads + path "${meta.id}_nanolyse.log" , emit: log + path "versions.yml" , emit: versions + + script: + """ + zcat ${reads} | NanoLyse --reference $nanolyse_db | gzip > ${meta.id}_nanolyse.fastq.gz + echo "NanoLyse reference: $params.lambda_reference" >${meta.id}_nanolyse.log + zcat ${reads} | echo "total reads before NanoLyse: \$((`wc -l`/4))" >>${meta.id}_nanolyse.log + gunzip -c ${meta.id}_nanolyse.fastq.gz | echo "total reads after NanoLyse: \$((`wc -l`/4))" >> ${meta.id}_nanolyse.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + NanoLyse: \$(NanoLyse --version | sed -e "s/NanoLyse //g") + END_VERSIONS + """ +} diff --git a/modules/local/pool_paired_reads.nf b/modules/local/pool_paired_reads.nf new file mode 100644 index 00000000..9e73028e --- /dev/null +++ b/modules/local/pool_paired_reads.nf @@ -0,0 +1,26 @@ +process POOL_PAIRED_READS { + tag "$meta.id" + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads1), path(reads2) + + output: + tuple val(meta), path("pooled_${meta.id}_*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + script: + """ + cat ${reads1} > "pooled_${meta.id}_1.fastq.gz" + cat ${reads2} > "pooled_${meta.id}_2.fastq.gz" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(cat --version 2>&1 | sed -n 1p | sed 's/cat (GNU coreutils) //') + END_VERSIONS + """ +} diff --git a/modules/local/pool_single_reads.nf b/modules/local/pool_single_reads.nf new file mode 100644 index 00000000..3ab6cc7c --- /dev/null +++ b/modules/local/pool_single_reads.nf @@ -0,0 +1,25 @@ +process POOL_SINGLE_READS { + tag "$meta.id" + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("pooled_${meta.id}.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + script: + """ + cat ${reads} > "pooled_${meta.id}.fastq.gz" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(cat --version 2>&1 | sed -n 1p | sed 's/cat (GNU coreutils) //') + END_VERSIONS + """ +} diff --git a/modules/local/quast.nf b/modules/local/quast.nf new file mode 100644 index 00000000..4b68f412 --- /dev/null +++ b/modules/local/quast.nf @@ -0,0 +1,28 @@ +process QUAST { + tag "${meta.assembler}-${meta.id}" + + conda "bioconda::quast=5.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/quast:5.0.2--py37pl526hb5aa323_2' : + 'biocontainers/quast:5.0.2--py37pl526hb5aa323_2' }" + + input: + tuple val(meta), path(assembly) + + output: + path "QUAST/*" , emit: qc + path "QUAST/report_rawassemblies.tsv", emit: report + path "versions.yml" , emit: versions + + script: + """ + metaquast.py --threads "${task.cpus}" --rna-finding --max-ref-number 0 -l "${meta.assembler}-${meta.id}" "${assembly}" -o "QUAST" + cp QUAST/report.tsv QUAST/report_rawassemblies.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + metaquast: \$(metaquast.py --version | sed "s/QUAST v//; s/ (MetaQUAST mode)//") + END_VERSIONS + """ +} diff --git a/modules/local/quast_bins.nf b/modules/local/quast_bins.nf new file mode 100644 index 00000000..30117404 --- /dev/null +++ b/modules/local/quast_bins.nf @@ -0,0 +1,37 @@ +process QUAST_BINS { + tag "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" + + conda "bioconda::quast=5.0.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/quast:5.0.2--py37pl526hb5aa323_2' : + 'biocontainers/quast:5.0.2--py37pl526hb5aa323_2' }" + + input: + tuple val(meta), path(bins) + + output: + path "QUAST/*", type: 'dir' , emit: dir + tuple val(meta), path("QUAST/*-quast_summary.tsv"), emit: quast_bin_summaries + path "versions.yml" , emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" + """ + BINS=\$(echo \"$bins\" | sed 's/[][]//g') + IFS=', ' read -r -a bins <<< \"\$BINS\" + for bin in \"\${bins[@]}\"; do + metaquast.py --threads "${task.cpus}" --max-ref-number 0 --rna-finding --gene-finding -l "\${bin}" "\${bin}" -o "QUAST/\${bin}" + if ! [ -f "QUAST/${prefix}-quast_summary.tsv" ]; then + cp "QUAST/\${bin}/transposed_report.tsv" "QUAST/${prefix}-quast_summary.tsv" + else + tail -n +2 "QUAST/\${bin}/transposed_report.tsv" >> "QUAST/${prefix}-quast_summary.tsv" + fi + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + metaquast: \$(metaquast.py --version | sed "s/QUAST v//; s/ (MetaQUAST mode)//") + END_VERSIONS + """ +} diff --git a/modules/local/quast_bins_summary.nf b/modules/local/quast_bins_summary.nf new file mode 100644 index 00000000..8b1734df --- /dev/null +++ b/modules/local/quast_bins_summary.nf @@ -0,0 +1,32 @@ +process QUAST_BINS_SUMMARY { + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + path(summaries) + + output: + path("quast_summary.tsv"), emit: summary + path "versions.yml" , emit: versions + + script: + """ + QUAST_BIN=\$(echo \"$summaries\" | sed 's/[][]//g') + IFS=', ' read -r -a quast_bin <<< \"\$QUAST_BIN\" + for quast_file in \"\${quast_bin[@]}\"; do + if ! 
[ -f "quast_summary.tsv" ]; then + cp "\${quast_file}" "quast_summary.tsv" + else + tail -n +2 "\${quast_file}" >> "quast_summary.tsv" + fi + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sed: \$(sed --version 2>&1 | sed -n 1p | sed 's/sed (GNU sed) //') + END_VERSIONS + """ +} diff --git a/modules/local/rename_postdastool.nf b/modules/local/rename_postdastool.nf new file mode 100644 index 00000000..6129dfbe --- /dev/null +++ b/modules/local/rename_postdastool.nf @@ -0,0 +1,23 @@ +process RENAME_POSTDASTOOL { + tag "${meta.assembler}-${meta.id}" + label 'process_low' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(bins) + + output: + tuple val(meta), path("${meta.assembler}-*Refined-${meta.id}.*.fa", includeInputs: true), optional:true, emit: refined_bins + tuple val(meta), path("${meta.assembler}-DASToolUnbinned-${meta.id}.fa"), optional:true, emit: refined_unbins + + script: + """ + if [[ -f unbinned.fa ]]; then + mv unbinned.fa ${meta.assembler}-DASToolUnbinned-${meta.id}.fa + fi + """ +} diff --git a/modules/local/rename_predastool.nf b/modules/local/rename_predastool.nf new file mode 100644 index 00000000..3d9373b8 --- /dev/null +++ b/modules/local/rename_predastool.nf @@ -0,0 +1,31 @@ +process RENAME_PREDASTOOL { + tag "${meta.assembler}-${meta.binner}-${meta.id}" + label 'process_low' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(bins) + + output: + tuple val(meta), path("${meta.assembler}-${meta.binner}Refined-${meta.id}*"), emit: renamed_bins + + script: + """ + if [ -n "${bins}" ] + then + for bin in ${bins}; do + if [[ \${bin} =~ ${meta.assembler}-${meta.binner}-${meta.id}.([_[:alnum:]]+).fa ]]; then + num=\${BASH_REMATCH[1]} + mv \${bin} ${meta.assembler}-${meta.binner}Refined-${meta.id}.\${num}.fa + else + echo "ERROR: the bin filename \${bin} does not match the expected format '${meta.assembler}-${meta.binner}-${meta.id}.([_[:alnum:]]+).fa'!" + exit 1 + fi + done + fi + """ +} diff --git a/modules/local/split_fasta.nf b/modules/local/split_fasta.nf new file mode 100644 index 00000000..4ea3b757 --- /dev/null +++ b/modules/local/split_fasta.nf @@ -0,0 +1,34 @@ +process SPLIT_FASTA { + tag "${meta.assembler}-${meta.binner}-${meta.id}" + label 'process_low' + + // Using container from metabat2 process, since this will be anyway already downloaded and contains biopython and pandas + conda "bioconda::metabat2=2.15 conda-forge::python=3.6.7 conda-forge::biopython=1.74 conda-forge::pandas=1.1.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' : + 'biocontainers/mulled-v2-e25d1fa2bb6cbacd47a4f8b2308bd01ba38c5dd7:75310f02364a762e6ba5206fcd11d7529534ed6e-0' }" + + input: + tuple val(meta), path(unbinned) + + output: + tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.[1-9]*.fa.gz") , optional:true, emit: unbinned + tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.pooled.fa.gz") , optional:true, emit: pooled + tuple val(meta), path("${meta.assembler}-${meta.binner}-${meta.id}.*.remaining.fa.gz"), optional:true, emit: remaining + path "versions.yml" , emit: versions + + script: + """ + # save unbinned contigs above thresholds into individual files, dump others in one file + split_fasta.py $unbinned ${params.min_length_unbinned_contigs} ${params.max_unbinned_contigs} ${params.min_contig_size} + + gzip *.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | sed 's/Python //g') + biopython: 1.7.4 + pandas: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('pandas').version)") + END_VERSIONS + """ +} diff --git a/modules/local/tiara_classify.nf b/modules/local/tiara_classify.nf new file mode 100644 index 00000000..8fde5241 --- /dev/null +++ b/modules/local/tiara_classify.nf @@ -0,0 +1,49 @@ +process TIARA_CLASSIFY { + tag "${meta.id}" + label "process_single" + + conda "conda-forge::r-tidyverse=1.3.1 conda-forge::r-optparse=1.7.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0' : + 'biocontainers/mulled-v2-1021c2bc41756fa99bc402f461dad0d1c35358c1:b0c847e4fb89c343b04036e33b2daa19c4152cf5-0' }" + + input: + tuple val(meta), path(classification), path(contig2bin), path(bins) + + output: + tuple val(meta), path("eukarya/*.fa"), emit: eukarya_bins, optional: true + tuple val(meta), path("prokarya/*.fa"), emit: prokarya_bins, optional: true + tuple val(meta), path("bacteria/*.fa"), emit: bacteria_bins, optional: true + tuple val(meta), path("archaea/*.fa"), emit: archaea_bins, optional: true + tuple val(meta), path("organelle/*.fa"), emit: organelle_bins, optional: true + tuple val(meta), path("unknown/*.fa"), emit: unknown_bins, optional: true + tuple val(meta), path("*.binclassification.tsv"), emit: bin_classifications + path 'versions.yml', emit: versions + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + """ + domain_classification.R \ + --classification_file ${classification} \ + --contig_to_bin ${contig2bin} \ + ${args} \ + --output_prefix ${prefix} + + mkdir eukarya + mkdir prokarya + mkdir bacteria + mkdir archaea + mkdir organelle + mkdir unknown + + while IFS=\$"\t" read bin domain; do + find -L . 
-name "\${bin}*" -exec mv {} \${domain}/ \\; + done < bin2classification.tsv + + cat <<-END_VERSIONS > versions.yml + r-base: \$(R --version | head -n 1 | grep -Eo '[0-9.]+ ') + r-tidyverse: \$(cat tidyverse_version.txt) + END_VERSIONS + """ +} diff --git a/modules/nf-core/adapterremoval/main.nf b/modules/nf-core/adapterremoval/main.nf new file mode 100644 index 00000000..29aac1c0 --- /dev/null +++ b/modules/nf-core/adapterremoval/main.nf @@ -0,0 +1,92 @@ +process ADAPTERREMOVAL { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::adapterremoval=2.3.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/adapterremoval:2.3.2--hb7ba0dd_0' : + 'biocontainers/adapterremoval:2.3.2--hb7ba0dd_0' }" + + input: + tuple val(meta), path(reads) + path(adapterlist) + + output: + tuple val(meta), path("${prefix}.truncated.fastq.gz") , optional: true, emit: singles_truncated + tuple val(meta), path("${prefix}.discarded.fastq.gz") , optional: true, emit: discarded + tuple val(meta), path("${prefix}.pair{1,2}.truncated.fastq.gz") , optional: true, emit: paired_truncated + tuple val(meta), path("${prefix}.collapsed.fastq.gz") , optional: true, emit: collapsed + tuple val(meta), path("${prefix}.collapsed.truncated.fastq.gz") , optional: true, emit: collapsed_truncated + tuple val(meta), path("${prefix}.paired.fastq.gz") , optional: true, emit: paired_interleaved + tuple val(meta), path('*.settings') , emit: settings + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def list = adapterlist ? "--adapter-list ${adapterlist}" : "" + prefix = task.ext.prefix ?: "${meta.id}" + + if (meta.single_end) { + """ + AdapterRemoval \\ + --file1 $reads \\ + $args \\ + $list \\ + --basename ${prefix} \\ + --threads ${task.cpus} \\ + --seed 42 \\ + --gzip + + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") + END_VERSIONS + """ + } else { + """ + AdapterRemoval \\ + --file1 ${reads[0]} \\ + --file2 ${reads[1]} \\ + $args \\ + $list \\ + --basename ${prefix} \\ + --threads $task.cpus \\ + --seed 42 \\ + --gzip + + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + ensure_fastq '${prefix}.pair1.truncated.gz' + ensure_fastq '${prefix}.pair2.truncated.gz' + ensure_fastq '${prefix}.collapsed.gz' + ensure_fastq '${prefix}.collapsed.truncated.gz' + ensure_fastq '${prefix}.paired.gz' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") + END_VERSIONS + """ + } + +} diff --git a/modules/nf-core/adapterremoval/meta.yml b/modules/nf-core/adapterremoval/meta.yml new file mode 100644 index 00000000..77273f60 --- /dev/null +++ b/modules/nf-core/adapterremoval/meta.yml @@ -0,0 +1,90 @@ +name: adapterremoval +description: Trim sequencing adapters and collapse overlapping reads +keywords: + - trimming + - adapters + - merging + - fastq +tools: + - adapterremoval: + description: The AdapterRemoval v2 tool for merging and clipping reads. 
+ homepage: https://github.com/MikkelSchubert/adapterremoval + documentation: https://adapterremoval.readthedocs.io + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" + - adapterlist: + type: file + description: Optional text file containing list of adapters to look for for removal + with one adapter per line. Otherwise will look for default adapters (see + AdapterRemoval man page), or can be modified to remove user-specified + adapters via ext.args. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - singles_truncated: + type: file + description: | + Adapter trimmed FastQ files of either single-end reads, or singleton + 'orphaned' reads from merging of paired-end data (i.e., one of the pair + was lost due to filtering thresholds). + pattern: "*.truncated.fastq.gz" + - discarded: + type: file + description: | + Adapter trimmed FastQ files of reads that did not pass filtering + thresholds. + pattern: "*.discarded.fastq.gz" + - pair1_truncated: + type: file + description: | + Adapter trimmed R1 FastQ files of paired-end reads that did not merge + with their respective R2 pair due to long templates. The respective pair + is stored in 'pair2_truncated'. + pattern: "*.pair1.truncated.fastq.gz" + - pair2_truncated: + type: file + description: | + Adapter trimmed R2 FastQ files of paired-end reads that did not merge + with their respective R1 pair due to long templates. The respective pair + is stored in 'pair1_truncated'. + pattern: "*.pair2.truncated.fastq.gz" + - collapsed: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair but were not trimmed. + pattern: "*.collapsed.fastq.gz" + - collapsed_truncated: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair and were trimmed of adapter due to sufficient overlap. 
+ pattern: "*.collapsed.truncated.fastq.gz" + - log: + type: file + description: AdapterRemoval log file + pattern: "*.settings" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@maxibor" + - "@jfy133" diff --git a/modules/nf-core/aria2/aria2.diff b/modules/nf-core/aria2/aria2.diff new file mode 100644 index 00000000..789fdb44 --- /dev/null +++ b/modules/nf-core/aria2/aria2.diff @@ -0,0 +1,24 @@ +Changes in module 'nf-core/aria2' +--- modules/nf-core/aria2/main.nf ++++ modules/nf-core/aria2/main.nf +@@ -12,7 +12,7 @@ + val source_url + + output: +- path ("$downloaded_file"), emit: downloaded_file ++ path ("checkm_data_2015_01_16/"), emit: downloaded_file + path "versions.yml" , emit: versions + + when: +@@ -30,6 +30,9 @@ + $args \\ + $source_url + ++ mkdir checkm_data_2015_01_16/ ++ tar x -C checkm_data_2015_01_16 -v -z -f *.tar.gz ++ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') + +************************************************************ diff --git a/modules/nf-core/aria2/main.nf b/modules/nf-core/aria2/main.nf new file mode 100644 index 00000000..b6091dad --- /dev/null +++ b/modules/nf-core/aria2/main.nf @@ -0,0 +1,41 @@ + +process ARIA2 { + tag "$source_url" + label 'process_single' + + conda "conda-forge::aria2=1.36.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/aria2:1.36.0' : + 'biocontainers/aria2:1.36.0' }" + + input: + val source_url + + output: + path ("checkm_data_2015_01_16/"), emit: downloaded_file + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + downloaded_file = source_url.split("/")[-1] + + """ + set -e + + aria2c \\ + --check-certificate=false \\ + $args \\ + $source_url + + mkdir checkm_data_2015_01_16/ + tar x -C checkm_data_2015_01_16 -v -z -f *.tar.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/aria2/meta.yml b/modules/nf-core/aria2/meta.yml new file mode 100644 index 00000000..64c2a524 --- /dev/null +++ b/modules/nf-core/aria2/meta.yml @@ -0,0 +1,30 @@ +name: "aria2" +description: CLI Download utility +keywords: + - download +tools: + - "aria2": + description: "aria2 is a lightweight multi-protocol & multi-source, cross platform download utility operated in command-line. It supports HTTP/HTTPS, FTP, SFTP, BitTorrent and Metalink." 
+ + tool_dev_url: "https://github.com/aria2/aria2/" + + licence: "['GPL v2']" + +input: + - source_url: + type: url + description: Source URL to be downloaded + pattern: "{http,https}*" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - downloaded_file: + type: file + description: Downloaded files from source + pattern: "*.*" + +authors: + - "@JoseEspinosa" diff --git a/modules/nf-core/bbmap/bbnorm/main.nf b/modules/nf-core/bbmap/bbnorm/main.nf new file mode 100644 index 00000000..9974bfb4 --- /dev/null +++ b/modules/nf-core/bbmap/bbnorm/main.nf @@ -0,0 +1,42 @@ +process BBMAP_BBNORM { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bbmap=39.01 pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-008daec56b7aaf3f162d7866758142b9f889d690:e8a286b2e789c091bac0a57302cdc78aa0112353-0': + 'biocontainers/mulled-v2-008daec56b7aaf3f162d7866758142b9f889d690:e8a286b2e789c091bac0a57302cdc78aa0112353-0' }" + + input: + tuple val(meta), path(fastq) + + output: + tuple val(meta), path("*.fastq.gz"), emit: fastq + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + input = meta.single_end ? "in=${fastq.join(',')}" : "in=${fastq[0]} in2=${fastq[1]}" + output = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.nm.fastq.gz out2=${prefix}_2.nm.fastq.gz" + + """ + bbnorm.sh \\ + $input \\ + $output \\ + $args \\ + threads=$task.cpus \\ + -Xmx${task.memory.toGiga()}g \\ + &> ${prefix}.bbnorm.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/bbnorm/meta.yml b/modules/nf-core/bbmap/bbnorm/meta.yml new file mode 100644 index 00000000..6c81bb41 --- /dev/null +++ b/modules/nf-core/bbmap/bbnorm/meta.yml @@ -0,0 +1,42 @@ +name: bbmap_bbnorm +description: BBNorm is designed to normalize coverage by down-sampling reads over high-depth areas of a genome, to result in a flat coverage distribution. +keywords: + - normalization + - assembly + - coverage +tools: + - bbmap: + description: "BBMap is a short read aligner, as well as various other bioinformatic tools." + homepage: "https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/" + documentation: "https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/" + tool_dev_url: "https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbnorm-guide/" + licence: "BBMap - Bushnell B. - sourceforge.net/projects/bbmap/" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: fastq file + pattern: "*.{fastq,fq}(.gz)?" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: fastq file + pattern: "*.{fastq, fq}.gz" + +authors: + - "@danilodileo" diff --git a/modules/nf-core/bcftools/consensus/main.nf b/modules/nf-core/bcftools/consensus/main.nf new file mode 100644 index 00000000..2c5e8607 --- /dev/null +++ b/modules/nf-core/bcftools/consensus/main.nf @@ -0,0 +1,36 @@ +process BCFTOOLS_CONSENSUS { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bcftools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(vcf), path(tbi), path(fasta) + + output: + tuple val(meta), path('*.fa'), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + cat $fasta \\ + | bcftools \\ + consensus \\ + $vcf \\ + $args \\ + > ${prefix}.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/consensus/meta.yml b/modules/nf-core/bcftools/consensus/meta.yml new file mode 100644 index 00000000..05a93a56 --- /dev/null +++ b/modules/nf-core/bcftools/consensus/meta.yml @@ -0,0 +1,49 @@ +name: bcftools_consensus +description: Compresses VCF files +keywords: + - variant calling + - consensus + - VCF +tools: + - consensus: + description: | + Create consensus sequence by applying VCF variants to a reference fasta file. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF file + pattern: "*.{vcf}" + - tbi: + type: file + description: tabix index file + pattern: "*.{tbi}" + - fasta: + type: file + description: FASTA reference file + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA reference consensus file + pattern: "*.{fasta,fa}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/bcftools/index/main.nf b/modules/nf-core/bcftools/index/main.nf new file mode 100644 index 00000000..43360aab --- /dev/null +++ b/modules/nf-core/bcftools/index/main.nf @@ -0,0 +1,37 @@ +process BCFTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "bioconda::bcftools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(vcf) + + output: + tuple val(meta), path("*.csi"), optional:true, emit: csi + tuple val(meta), path("*.tbi"), optional:true, emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + bcftools \\ + index \\ + $args \\ + --threads $task.cpus \\ + $vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/index/meta.yml b/modules/nf-core/bcftools/index/meta.yml new file mode 100644 index 00000000..b883fa5f --- /dev/null +++ b/modules/nf-core/bcftools/index/meta.yml @@ -0,0 +1,49 @@ +name: bcftools_index +description: Index VCF tools +keywords: + - vcf + - index + - bcftools + - csi + - tbi +tools: + - bcftools: + description: BCFtools is a set of utilities that manipulate variant calls in the Variant Call Format (VCF) and its binary counterpart BCF. All commands work transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed. Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatically even when streaming from a pipe. Indexed VCF and BCF will work in all situations. Un-indexed VCF and BCF and streams will work in most, but not all situations. + homepage: https://samtools.github.io/bcftools/ + documentation: https://samtools.github.io/bcftools/howtos/index.html + tool_dev_url: https://github.com/samtools/bcftools + doi: "10.1093/gigascience/giab008" + licence: ["MIT", "GPL-3.0-or-later"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - VCF: + type: file + description: VCF file (optionally GZIPPED) + pattern: "*.{vcf,vcf.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - version: + type: file + description: File containing software version + pattern: "versions.yml" + - csi: + type: file + description: Default VCF file index file + pattern: "*.csi" + - tbi: + type: file + description: Alternative VCF file index file for larger files (activated with -t parameter) + pattern: "*.tbi" + +authors: + - "@jfy133" diff --git a/modules/nf-core/bcftools/view/main.nf b/modules/nf-core/bcftools/view/main.nf new file mode 100644 index 00000000..86f807d3 --- /dev/null +++ b/modules/nf-core/bcftools/view/main.nf @@ -0,0 +1,55 @@ +process BCFTOOLS_VIEW { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bcftools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(vcf), path(index) + path(regions) + path(targets) + path(samples) + + output: + tuple val(meta), path("*.gz") , emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def regions_file = regions ? "--regions-file ${regions}" : "" + def targets_file = targets ? "--targets-file ${targets}" : "" + def samples_file = samples ? 
"--samples-file ${samples}" : "" + """ + bcftools view \\ + --output ${prefix}.vcf.gz \\ + ${regions_file} \\ + ${targets_file} \\ + ${samples_file} \\ + $args \\ + --threads $task.cpus \\ + ${vcf} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/view/meta.yml b/modules/nf-core/bcftools/view/meta.yml new file mode 100644 index 00000000..326fd1fa --- /dev/null +++ b/modules/nf-core/bcftools/view/meta.yml @@ -0,0 +1,63 @@ +name: bcftools_view +description: View, subset and filter VCF or BCF files by position and filtering expression. Convert between VCF and BCF +keywords: + - variant calling + - view + - bcftools + - VCF + +tools: + - view: + description: | + View, subset and filter VCF or BCF files by position and filtering expression. Convert between VCF and BCF + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: | + The vcf file to be inspected. + e.g. 'file.vcf' + - index: + type: file + description: | + The tab index for the VCF file to be inspected. + e.g. 'file.tbi' + - regions: + type: file + description: | + Optionally, restrict the operation to regions listed in this file. + e.g. 'file.vcf' + - targets: + type: file + description: | + Optionally, restrict the operation to regions listed in this file (doesn't rely upon index files) + e.g. 'file.vcf' + - samples: + type: file + description: | + Optional, file of sample names to be included or excluded. + e.g. 'file.tsv' +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF normalized output file + pattern: "*.{vcf.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 00000000..5021e6fc --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf @@ -0,0 +1,80 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? 
reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size > 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + +} diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml new file mode 100644 index 00000000..8a39e309 --- /dev/null +++ b/modules/nf-core/cat/fastq/meta.yml @@ -0,0 +1,40 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - cat + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/centrifuge/centrifuge/environment.yml b/modules/nf-core/centrifuge/centrifuge/environment.yml new file mode 100644 index 00000000..cf34dc0e --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/environment.yml @@ -0,0 +1,7 @@ +name: centrifuge_centrifuge +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::centrifuge=1.0.4.1 diff --git a/modules/nf-core/centrifuge/centrifuge/main.nf b/modules/nf-core/centrifuge/centrifuge/main.nf new file mode 100644 index 00000000..d9a5653d --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/main.nf @@ -0,0 +1,91 @@ +process CENTRIFUGE_CENTRIFUGE { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
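The CAT_FASTQ module above splits an alternating [R1, R2, R1, R2, ...] file list into separate mate lists using `eachWithIndex` and a bitwise test on the index. A small Groovy sketch of that partitioning step, with hypothetical lane-level filenames:

// Illustrative only: how the paired-end branch partitions the staged reads
def readList = ['s1_L001_R1.fq.gz', 's1_L001_R2.fq.gz',
                's1_L002_R1.fq.gz', 's1_L002_R2.fq.gz']
def read1 = []
def read2 = []
// even indices (ix & 1 == 0) go to read1, odd indices to read2
readList.eachWithIndex { v, ix -> (ix & 1 ? read2 : read1) << v }

assert read1 == ['s1_L001_R1.fq.gz', 's1_L002_R1.fq.gz']
assert read2 == ['s1_L001_R2.fq.gz', 's1_L002_R2.fq.gz']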
+ 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4.1--hdcf5f25_1' : + 'biocontainers/centrifuge:1.0.4.1--hdcf5f25_1' }" + + input: + tuple val(meta), path(reads) + path db + val save_unaligned + val save_aligned + + output: + tuple val(meta), path('*report.txt') , emit: report + tuple val(meta), path('*results.txt') , emit: results + tuple val(meta), path('*.{sam,tab}') , optional: true, emit: sam + tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped + tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def unaligned = '' + def aligned = '' + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' + } + """ + ## we add "-no-name ._" to ensure silly Mac OSX metafiles files aren't included + db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/\\.1.cf\$//'` + + ## make a directory for placing the pipe files in somewhere other than default /tmp + ## otherwise get pipefile name clashes when multiple centrifuge runs on same node + ## use /tmp at the same time + mkdir ./temp + + centrifuge \\ + -x \$db_name \\ + --temp-directory ./temp \\ + -p $task.cpus \\ + $paired \\ + --report-file ${prefix}.report.txt \\ + -S ${prefix}.results.txt \\ + $unaligned \\ + $aligned \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def unaligned = '' + def aligned = '' + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' + } + """ + touch ${prefix}.report.txt + touch ${prefix}.results.txt + touch ${prefix}.sam + echo | gzip -n > ${prefix}.unmapped.fastq.gz + echo | gzip -n > ${prefix}.mapped.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/centrifuge/centrifuge/meta.yml b/modules/nf-core/centrifuge/centrifuge/meta.yml new file mode 100644 index 00000000..a06104e1 --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/meta.yml @@ -0,0 +1,75 @@ +name: centrifuge_centrifuge +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. 
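The CENTRIFUGE_CENTRIFUGE script above locates the index prefix inside the database directory by finding any `*.1.cf` file (ignoring macOS `._*` metadata files) and stripping the `.1.cf` suffix. A minimal sketch of that lookup, assuming a hypothetical index named `p_compressed+h+v`:

#!/usr/bin/env bash
# Fake database directory holding a hypothetical three-part Centrifuge index
mkdir -p db && touch db/p_compressed+h+v.{1,2,3}.cf

# Same lookup as the module: find one *.1.cf file and drop the suffix
db_name=$(find -L db -name "*.1.cf" -not -name "._*" | sed 's/\.1\.cf$//')
echo "$db_name"    # -> db/p_compressed+h+v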
+ homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Path to directory containing centrifuge database files + - save_unaligned: + type: boolean + description: If true unmapped fastq files are saved + - save_aligned: + type: boolean + description: If true mapped fastq files are saved +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: | + File containing a classification summary + pattern: "*.{report.txt}" + - results: + type: file + description: | + File containing classification results + pattern: "*.{results.txt}" + - sam: + type: file + description: | + Optional output file containing read alignments (SAM format )or a table of per-read hit information (TAB)s + pattern: "*.{sam,tab}" + - fastq_unmapped: + type: file + description: Unmapped fastq files + pattern: "*.unmapped.fastq.gz" + - fastq_mapped: + type: file + description: Mapped fastq files + pattern: "*.mapped.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" +maintainers: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" diff --git a/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test b/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test new file mode 100644 index 00000000..d83b522a --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process CENTRIFUGE_CENTRIFUGE" + script "../main.nf" + process "CENTRIFUGE_CENTRIFUGE" + + tag "modules" + tag "modules_nfcore" + tag "centrifuge" + tag "centrifuge/centrifuge" + tag "untar" + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = db = [ [], file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/minigut_cf.tar.gz', checkIfExists: true) ] + """ + } + } + } + + test("sarscov2_fastq_se") { + + when { + process { + """ + input[0] = [ [id: 'test', single_end: true], file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = true + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.report[0][1]).name, + file(process.out.results[0][1]).name, + file(process.out.fastq_mapped[0][1][0]).name, + file(process.out.fastq_unmapped[0][1][0]).name, + ).match() } + ) + } + + } + + test("sarscov2_fastq_pe") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = true + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.report[0][1]).name, + file(process.out.results[0][1]).name, + 
file(process.out.fastq_mapped[0][1][0]).name, + file(process.out.fastq_unmapped[0][1][0]).name, + ).match() } + ) + } + + } + + test("sarscov2_fastq_se_stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id: 'test'], file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = true + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test.snap b/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test.snap new file mode 100644 index 00000000..f8a2ef7b --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test.snap @@ -0,0 +1,125 @@ +{ + "sarscov2_fastq_se_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.report.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.results.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.mapped.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test.unmapped.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + "versions.yml:md5,1ce028d9f968eca6df31586fe3b77c84" + ], + "fastq_mapped": [ + [ + { + "id": "test" + }, + "test.mapped.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "fastq_unmapped": [ + [ + { + "id": "test" + }, + "test.unmapped.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "report": [ + [ + { + "id": "test" + }, + "test.report.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "results": [ + [ + { + "id": "test" + }, + "test.results.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "sam": [ + [ + { + "id": "test" + }, + "test.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,1ce028d9f968eca6df31586fe3b77c84" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T07:47:36.886757827" + }, + "sarscov2_fastq_se": { + "content": [ + "test.report.txt", + "test.results.txt", + "", + "" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T08:22:31.470316024" + }, + "sarscov2_fastq_pe": { + "content": [ + "test.report.txt", + "test.results.txt", + "test.mapped.fastq.1.gz", + "test.unmapped.fastq.1.gz" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T08:22:48.866073154" + } +} \ No newline at end of file diff --git a/modules/nf-core/centrifuge/centrifuge/tests/tags.yml b/modules/nf-core/centrifuge/centrifuge/tests/tags.yml new file mode 100644 index 00000000..53444cd2 --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/tests/tags.yml @@ -0,0 +1,2 @@ +centrifuge/centrifuge: + - "modules/nf-core/centrifuge/centrifuge/**" diff --git a/modules/nf-core/centrifuge/kreport/environment.yml b/modules/nf-core/centrifuge/kreport/environment.yml new file mode 100644 index 00000000..5c8fb451 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/environment.yml @@ -0,0 +1,7 @@ +name: centrifuge_kreport +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::centrifuge=1.0.4.1 diff --git a/modules/nf-core/centrifuge/kreport/main.nf b/modules/nf-core/centrifuge/kreport/main.nf new file mode 100644 index 
00000000..25eb7167 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/main.nf @@ -0,0 +1,45 @@ +process CENTRIFUGE_KREPORT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4.1--hdcf5f25_1' : + 'biocontainers/centrifuge:1.0.4.1--hdcf5f25_1' }" + + input: + tuple val(meta), path(report) + path db + + output: + tuple val(meta), path('*.txt'), emit: kreport + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/\\.1.cf\$//'` + centrifuge-kreport -x \$db_name ${report} > ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/centrifuge/kreport/meta.yml b/modules/nf-core/centrifuge/kreport/meta.yml new file mode 100644 index 00000000..5641152b --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/meta.yml @@ -0,0 +1,51 @@ +name: "centrifuge_kreport" +description: Creates Kraken-style reports from centrifuge out files +keywords: + - classify + - metagenomics + - fastq + - db + - report + - kraken +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: File containing the centrifuge classification report + pattern: "*.{txt}" + - db: + type: directory + description: Path to directory containing centrifuge database files +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - kreport: + type: file + description: | + File containing kraken-style report from centrifuge + out files. 
+ pattern: "*.{txt}" +authors: + - "@sofstam" + - "@jfy133" +maintainers: + - "@sofstam" + - "@jfy133" diff --git a/modules/nf-core/centrifuge/kreport/tests/main.nf.test b/modules/nf-core/centrifuge/kreport/tests/main.nf.test new file mode 100644 index 00000000..6347bd7c --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/tests/main.nf.test @@ -0,0 +1,81 @@ +// nf-core modules test centrifuge/kreport +nextflow_process { + + name "Test Process CENTRIFUGE_KREPORT" + script "../main.nf" + process "CENTRIFUGE_KREPORT" + + tag "modules" + tag "modules_nfcore" + tag "centrifuge" + tag "centrifuge/centrifuge" + tag "centrifuge/kreport" + tag "untar" + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = db = [ [], file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/minigut_cf.tar.gz', checkIfExists: true) ] + """ + } + } + run("CENTRIFUGE_CENTRIFUGE") { + script "../../../centrifuge/centrifuge/main.nf" + process { + """ + input[0] = [ [id: 'test', single_end: true], file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = true + input[3] = true + """ + } + } + } + + test("sarscov2_fastq_se") { + + when { + process { + """ + input[0] = CENTRIFUGE_CENTRIFUGE.out.results + input[1] = UNTAR.out.untar.map{it[1]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.kreport[0][1]).name, + ).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = CENTRIFUGE_CENTRIFUGE.out.results + input[1] = UNTAR.out.untar.map{it[1]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/centrifuge/kreport/tests/main.nf.test.snap b/modules/nf-core/centrifuge/kreport/tests/main.nf.test.snap new file mode 100644 index 00000000..4e0aaa79 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/tests/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,43c766a19f2edf7e05d1a2a0b1816b13" + ], + "kreport": [ + [ + { + "id": "test", + "single_end": true + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,43c766a19f2edf7e05d1a2a0b1816b13" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T06:18:36.794405448" + }, + "sarscov2_fastq_se": { + "content": [ + "test.txt" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T06:28:20.461891873" + } +} \ No newline at end of file diff --git a/modules/nf-core/centrifuge/kreport/tests/tags.yml b/modules/nf-core/centrifuge/kreport/tests/tags.yml new file mode 100644 index 00000000..a3823d76 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/tests/tags.yml @@ -0,0 +1,2 @@ +centrifuge/kreport: + - "modules/nf-core/centrifuge/kreport/**" diff --git a/modules/nf-core/checkm/lineagewf/environment.yml b/modules/nf-core/checkm/lineagewf/environment.yml new file mode 100644 index 00000000..1b870502 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::checkm-genome=1.2.3 diff --git 
a/modules/nf-core/checkm/lineagewf/main.nf b/modules/nf-core/checkm/lineagewf/main.nf new file mode 100644 index 00000000..67fd8f35 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/main.nf @@ -0,0 +1,59 @@ +process CHECKM_LINEAGEWF { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.3--pyhdfd78af_1' : + 'biocontainers/checkm-genome:1.2.3--pyhdfd78af_1' }" + + input: + tuple val(meta), path(fasta, stageAs: "input_bins/*") + val fasta_ext + path db + + output: + tuple val(meta), path("${prefix}") , emit: checkm_output + tuple val(meta), path("${prefix}/lineage.ms"), emit: marker_file + tuple val(meta), path("${prefix}.tsv") , emit: checkm_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def checkm_db = db ? "export CHECKM_DATA_PATH=${db}" : "" + prefix = task.ext.prefix ?: "${meta.id}" + """ + ${checkm_db} + + checkm \\ + lineage_wf \\ + -t ${task.cpus} \\ + -f ${prefix}.tsv \\ + --tab_table \\ + --pplacer_threads ${task.cpus} \\ + -x ${fasta_ext} \\ + ${args} \\ + input_bins/ \\ + ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir ${prefix}/ + touch ${prefix}/lineage.ms ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm/lineagewf/meta.yml b/modules/nf-core/checkm/lineagewf/meta.yml new file mode 100644 index 00000000..e32441d2 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/meta.yml @@ -0,0 +1,83 @@ +name: checkm_lineagewf +description: CheckM provides a set of tools for assessing the quality of genomes recovered + from isolates, single cells, or metagenomes. +keywords: + - checkm + - mag + - metagenome + - quality + - isolates + - microbes + - single cells + - completeness + - contamination + - bins + - genome bins +tools: + - checkm: + description: Assess the quality of microbial genomes recovered from isolates, + single cells, and metagenomes. + homepage: https://ecogenomics.github.io/CheckM/ + documentation: https://github.com/Ecogenomics/CheckM/wiki + tool_dev_url: https://github.com/Ecogenomics/CheckM + doi: "10.1101/gr.186072.114" + licence: ["GPL v3"] + identifier: biotools:checkm +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: One or a list of multiple FASTA files of each bin, with extension + defined with the fasta_ext value + pattern: "*.{$fasta_ext}" + - - fasta_ext: + type: string + description: The file-type extension suffix of the input FASTA files (e.g., + fasta, fna, fa, fas) + - - db: + type: directory + description: Optional directory pointing to checkM database to prevent re-downloading +output: + - checkm_output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample', bin:'1' ] + - ${prefix}: + type: directory + description: CheckM output directory + pattern: "*/" + - marker_file: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'sample', bin:'1' ] + - ${prefix}/lineage.ms: + type: file + description: Lineage file + pattern: "*.ms" + - checkm_tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'sample', bin:'1' ] + - ${prefix}.tsv: + type: file + description: CheckM summary completeness statistics table + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/checkm/lineagewf/tests/main.nf.test b/modules/nf-core/checkm/lineagewf/tests/main.nf.test new file mode 100644 index 00000000..8d60100e --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/tests/main.nf.test @@ -0,0 +1,61 @@ +nextflow_process { + name "Test Process CHECKM_LINEAGEWF" + script "../main.nf" + process "CHECKM_LINEAGEWF" + + tag "modules" + tag "modules_nfcore" + tag "checkm" + tag "checkm/lineagewf" + + test("checkm - lineage_wf") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + + then { + assert process.success + assert file(process.out.checkm_output[0][1]).list().find { file(it).name == "checkm.log" } + assert snapshot( + path(process.out.marker_file[0][1]).readLines().any{it.contains("PF00312.17")}, + process.out.checkm_tsv, + process.out.versions + ).match() + } + + } + + test("stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + + } +} diff --git a/modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap b/modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap new file mode 100644 index 00000000..6d6d7f75 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/tests/main.nf.test.snap @@ -0,0 +1,99 @@ +{ + "stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + "versions.yml:md5,08f99a3a9677aba1509cda63dcf5ce71" + ], + "checkm_output": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "checkm_tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "marker_file": [ + [ + { + "id": "test", + "single_end": false + }, + "lineage.ms:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,08f99a3a9677aba1509cda63dcf5ce71" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:36:45.930077242" + }, + "checkm - lineage_wf": { + "content": [ + true, + [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,d5559764f563c4b55223e4e4a3dc1ec9" + ] + ], + [ + 
"versions.yml:md5,08f99a3a9677aba1509cda63dcf5ce71" + ] + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:27:36.491322471" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm/lineagewf/tests/tags.yml b/modules/nf-core/checkm/lineagewf/tests/tags.yml new file mode 100644 index 00000000..04438be8 --- /dev/null +++ b/modules/nf-core/checkm/lineagewf/tests/tags.yml @@ -0,0 +1,2 @@ +checkm/lineagewf: + - modules/nf-core/checkm/lineagewf/** diff --git a/modules/nf-core/checkm/qa/environment.yml b/modules/nf-core/checkm/qa/environment.yml new file mode 100644 index 00000000..1b870502 --- /dev/null +++ b/modules/nf-core/checkm/qa/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::checkm-genome=1.2.3 diff --git a/modules/nf-core/checkm/qa/main.nf b/modules/nf-core/checkm/qa/main.nf new file mode 100644 index 00000000..042f8b04 --- /dev/null +++ b/modules/nf-core/checkm/qa/main.nf @@ -0,0 +1,55 @@ +process CHECKM_QA { + tag "${meta.id}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/checkm-genome:1.2.3--pyhdfd78af_1' : + 'biocontainers/checkm-genome:1.2.3--pyhdfd78af_1' }" + + input: + tuple val(meta), path(analysis_dir), path(marker_file), path(coverage_file) + path exclude_marker_file + + output: + tuple val(meta), path("${prefix}.txt") , optional: true, emit: output + tuple val(meta), path("${prefix}.fasta"), optional: true, emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.args?.matches(".*-o 9.*|.*--out_file 9.*") ? "fasta" : "txt" + def coverage = coverage_file && coverage_file.isFile() ? "--coverage_file ${coverage_file}" : "" + def exclude = exclude_marker_file && exclude_marker_file.isFile() ? "--exclude_markers ${exclude_marker_file}" : "" + """ + checkm \\ + qa \\ + --threads ${task.cpus} \\ + --file ${prefix}.${suffix} \\ + ${marker_file} \\ + ${analysis_dir} \\ + ${coverage} \\ + ${exclude} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm: \$( checkm 2>&1 | grep '...:::' | sed 's/.*CheckM v//;s/ .*//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm/qa/meta.yml b/modules/nf-core/checkm/qa/meta.yml new file mode 100644 index 00000000..cd41eaec --- /dev/null +++ b/modules/nf-core/checkm/qa/meta.yml @@ -0,0 +1,80 @@ +name: checkm_qa +description: CheckM provides a set of tools for assessing the quality of genomes recovered + from isolates, single cells, or metagenomes. +keywords: + - checkm + - mag + - metagenome + - quality + - isolates + - microbes + - single cells + - completeness + - contamination + - bins + - genome bins + - qa + - quality assurnce +tools: + - checkm: + description: Assess the quality of microbial genomes recovered from isolates, + single cells, and metagenomes. 
+ homepage: https://ecogenomics.github.io/CheckM/ + documentation: https://github.com/Ecogenomics/CheckM/wiki + tool_dev_url: https://github.com/Ecogenomics/CheckM + doi: "10.1101/gr.186072.114" + licence: ["GPL v3"] + identifier: biotools:checkm +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - analysis_dir: + type: file + description: Directory containing output of checkm/analyze or checkm/lineage_wf + etc. + pattern: "*" + - marker_file: + type: file + description: Marker file specified during checkm/analyze or produced by checkm/{lineage,taxonomy}_wf + pattern: "*.ms" + - coverage_file: + type: file + description: File containing coverage of each sequence (generated by checkm + coverage) + - - exclude_marker_file: + type: file + description: File specifying markers to exclude from marker sets +output: + - output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.txt: + type: file + description: "Default completeness statistics in various formats, as specified + with --out_format (excluding option: 9)" + pattern: "*.txt" + - fasta: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.fasta: + type: file + description: Output in fasta format (only if --out_format 9) + pattern: "*.fasta" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/checkm/qa/tests/main.nf.test b/modules/nf-core/checkm/qa/tests/main.nf.test new file mode 100644 index 00000000..8037bbc2 --- /dev/null +++ b/modules/nf-core/checkm/qa/tests/main.nf.test @@ -0,0 +1,88 @@ +nextflow_process { + name "Test Process CHECKM_QA" + script "../main.nf" + process "CHECKM_QA" + + tag "modules" + tag "modules_nfcore" + tag "checkm" + tag "checkm/qa" + tag "checkm/lineagewf" + + test("checkm - qa") { + + setup { + run("CHECKM_LINEAGEWF") { + script "../../lineagewf/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + } + + when { + process { + """ + input[0] = CHECKM_LINEAGEWF.out.checkm_output.join(CHECKM_LINEAGEWF.out.marker_file) + .map { v -> v + [file('NO_FILE')] } + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("stub") { + + options "-stub" + + setup { + run("CHECKM_LINEAGEWF") { + script "../../lineagewf/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + } + + when { + process { + """ + input[0] = CHECKM_LINEAGEWF.out.checkm_output.join(CHECKM_LINEAGEWF.out.marker_file) + .map { v -> v + [file('NO_FILE')] } + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/checkm/qa/tests/main.nf.test.snap 
b/modules/nf-core/checkm/qa/tests/main.nf.test.snap new file mode 100644 index 00000000..77eca77b --- /dev/null +++ b/modules/nf-core/checkm/qa/tests/main.nf.test.snap @@ -0,0 +1,96 @@ +{ + "checkm - qa": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,645f4282569afb4b171396732b2d2582" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ], + "fasta": [ + + ], + "output": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,645f4282569afb4b171396732b2d2582" + ] + ], + "versions": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:44:09.849072843" + }, + "stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "output": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,7a0683a78cbf54a6a69ee64055c584a6" + ] + } + ], + "meta": { + "nf-test": "0.9.1", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-05T04:14:12.680834625" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm/qa/tests/tags.yml b/modules/nf-core/checkm/qa/tests/tags.yml new file mode 100644 index 00000000..08b4747b --- /dev/null +++ b/modules/nf-core/checkm/qa/tests/tags.yml @@ -0,0 +1,3 @@ +checkm/qa: + - modules/nf-core/checkm/lineagewf/** + - modules/nf-core/checkm/qa/** diff --git a/modules/nf-core/checkm2/databasedownload/environment.yml b/modules/nf-core/checkm2/databasedownload/environment.yml new file mode 100644 index 00000000..52d11ba9 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::aria2=1.36.0 diff --git a/modules/nf-core/checkm2/databasedownload/main.nf b/modules/nf-core/checkm2/databasedownload/main.nf new file mode 100644 index 00000000..6144067b --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/main.nf @@ -0,0 +1,55 @@ +import groovy.json.JsonSlurper + +process CHECKM2_DATABASEDOWNLOAD { + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/aria2:1.36.0': + 'biocontainers/aria2:1.36.0' }" + + input: + val(db_zenodo_id) + + output: + tuple val(meta), path("checkm2_db_v${db_version}.dmnd"), emit: database + path("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + zenodo_id = db_zenodo_id ?: 5571251 // Default to latest version if no ID provided + api_data = (new JsonSlurper()).parseText(file("https://zenodo.org/api/records/${zenodo_id}").text) + db_version = api_data.metadata.version + checksum = api_data.files[0].checksum.replaceFirst(/^md5:/, "md5=") + meta = [id: 'checkm2_db', version: db_version] + """ + # Automatic download is broken when using singularity/apptainer (https://github.com/chklovski/CheckM2/issues/73) + # So it's necessary to download the database manually + aria2c \ + ${args} \ + --checksum ${checksum} \ + https://zenodo.org/records/${zenodo_id}/files/checkm2_database.tar.gz + + tar -xzf checkm2_database.tar.gz + db_path=\$(find -name *.dmnd) + mv \$db_path checkm2_db_v${db_version}.dmnd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') + END_VERSIONS + """ + + stub: + """ + touch checkm_db.dmnd + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm2: \$(checkm2 --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm2/databasedownload/meta.yml b/modules/nf-core/checkm2/databasedownload/meta.yml new file mode 100644 index 00000000..632b4922 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/meta.yml @@ -0,0 +1,42 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "checkm2_databasedownload" +description: CheckM2 database download +keywords: + - checkm + - mag + - metagenome + - quality + - completeness + - contamination + - bins +tools: + - "checkm2": + description: "CheckM2 - Rapid assessment of genome bin quality using machine learning" + homepage: "https://github.com/chklovski/CheckM2" + doi: "10.1038/s41592-023-01940-w" + licence: ["GPL v3"] + identifier: "" + +input: + - - db_zenodo_id: + type: integer + description: Zenodo ID of the CheckM2 database to download + +output: + - database: + - meta: + type: map + description: | + Groovy Map containing database information + e.g. 
`[ id:'test', version:1 ]` + - checkm2_db_v${db_version}.dmnd: + type: file + description: CheckM2 database file + pattern: "checkm2_db_v*.dmnd" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@dialvarezs" diff --git a/modules/nf-core/checkm2/databasedownload/tests/main.nf.test b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test new file mode 100644 index 00000000..2a98f051 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test @@ -0,0 +1,30 @@ +nextflow_process { + + name "Test Process CHECKM2_DATABASEDOWNLOAD" + tag "modules_nfcore" + tag "modules" + tag "checkm2" + tag "checkm2/databasedownload" + script "modules/nf-core/checkm2/databasedownload/main.nf" + process "CHECKM2_DATABASEDOWNLOAD" + + test("Test CheckM2 Database Download") { + + when { + process { + """ + input[0] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + ) + } + + } + +} diff --git a/modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap new file mode 100644 index 00000000..403d26fd --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "Test CheckM2 Database Download": { + "content": [ + [ + "versions.yml:md5,6201d5ac7aca6e32b98daf4f8656aa2a" + ] + ], + "timestamp": "2024-09-16T22:23:54.183040031" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm2/databasedownload/tests/tags.yml b/modules/nf-core/checkm2/databasedownload/tests/tags.yml new file mode 100644 index 00000000..46266770 --- /dev/null +++ b/modules/nf-core/checkm2/databasedownload/tests/tags.yml @@ -0,0 +1,2 @@ +checkm2/databasedownload: + - modules/nf-core/checkm2/databasedownload/** diff --git a/modules/nf-core/checkm2/predict/environment.yml b/modules/nf-core/checkm2/predict/environment.yml new file mode 100644 index 00000000..18fd1f51 --- /dev/null +++ b/modules/nf-core/checkm2/predict/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::checkm2=1.0.2 diff --git a/modules/nf-core/checkm2/predict/main.nf b/modules/nf-core/checkm2/predict/main.nf new file mode 100644 index 00000000..25271ba9 --- /dev/null +++ b/modules/nf-core/checkm2/predict/main.nf @@ -0,0 +1,52 @@ +process CHECKM2_PREDICT { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/checkm2:1.0.2--pyh7cba7a3_0': + 'biocontainers/checkm2:1.0.2--pyh7cba7a3_0' }" + + input: + tuple val(meta), path(fasta, stageAs: "input_bins/*") + tuple val(dbmeta), path(db) + + output: + tuple val(meta), path("${prefix}") , emit: checkm2_output + tuple val(meta), path("${prefix}/quality_report.tsv"), emit: checkm2_tsv + path("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + checkm2 \\ + predict \\ + --input ${fasta} \\ + --output-directory ${prefix} \\ + --threads ${task.cpus} \\ + --database_path ${db} \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm2: \$(checkm2 --version) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p ${prefix}/diamond_output ${prefix}/protein_files + touch ${prefix}/quality_report.tsv ${prefix}/checkm2.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + checkm2: \$(checkm2 --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/checkm2/predict/meta.yml b/modules/nf-core/checkm2/predict/meta.yml new file mode 100644 index 00000000..48cc9fbc --- /dev/null +++ b/modules/nf-core/checkm2/predict/meta.yml @@ -0,0 +1,65 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "checkm2_predict" +description: CheckM2 bin quality prediction +keywords: + - checkm + - mag + - metagenome + - quality + - completeness + - contamination + - bins +tools: + - "checkm2": + description: "CheckM2 - Rapid assessment of genome bin quality using machine learning" + homepage: "https://github.com/chklovski/CheckM2" + doi: "10.1038/s41592-023-01940-w" + licence: ["GPL v3"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - fasta: + type: file + description: One or multiple FASTA files of each bin + pattern: "*.{fasta,fna,fa}" + - - dbmeta: + type: map + description: | + Groovy Map containing database information + e.g. `[ id:'test', version:1 ]` + - db: + type: file + description: CheckM2 database +output: + - checkm2_output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - ${prefix}: + type: directory + description: CheckM2 output directory + pattern: "${prefix}/" + - checkm2_tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test' ]` + - ${prefix}/quality_report.tsv: + type: file + description: CheckM2 summary completeness statistics table + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@dialvarezs" diff --git a/modules/nf-core/checkm2/predict/tests/main.nf.test b/modules/nf-core/checkm2/predict/tests/main.nf.test new file mode 100644 index 00000000..e825f74c --- /dev/null +++ b/modules/nf-core/checkm2/predict/tests/main.nf.test @@ -0,0 +1,46 @@ +nextflow_process { + + name "Test Process CHECKM2_PREDICT" + tag "modules_nfcore" + tag "modules" + tag "checkm2" + tag "checkm2/predict" + tag "checkm2/databasedownload" + script "modules/nf-core/checkm2/predict/main.nf" + process "CHECKM2_PREDICT" + + test("Test CheckM2 Predict") { + + setup { + run("CHECKM2_DATABASEDOWNLOAD") { + script "../../databasedownload/main.nf" + process { + """ + input[0] = [] + """ + } + } + } + + when { + params { + outdir = "${launchDir}/tests/results" + } + process { + """ + input[0] = [ [id: 'test'], [file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true)] ] + input[1] = CHECKM2_DATABASEDOWNLOAD.out.database + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.checkm2_tsv, process.out.versions).match() } + ) + } + + } + +} diff --git a/modules/nf-core/checkm2/predict/tests/main.nf.test.snap b/modules/nf-core/checkm2/predict/tests/main.nf.test.snap new file mode 100644 index 00000000..6fd2e918 --- /dev/null +++ b/modules/nf-core/checkm2/predict/tests/main.nf.test.snap @@ -0,0 +1,18 @@ +{ + "Test CheckM2 Predict": { + "content": [ + [ + [ + { + "id": "test" + }, + "quality_report.tsv:md5,7f05ff49d18697304575d1106a871501" + ] + ], + [ + "versions.yml:md5,088ec2d8a46efd530c11019328064bff" + ] + ], + "timestamp": "2024-09-16T22:43:50.787486798" + } +} \ No newline at end of file diff --git a/modules/nf-core/checkm2/predict/tests/tags.yml b/modules/nf-core/checkm2/predict/tests/tags.yml new file mode 100644 index 00000000..c31d112a --- /dev/null +++ b/modules/nf-core/checkm2/predict/tests/tags.yml @@ -0,0 +1,3 @@ +checkm2/predict: + - modules/nf-core/checkm2/predict/** + - modules/nf-core/checkm2/databasedownload/** diff --git a/modules/nf-core/chopper/environment.yml b/modules/nf-core/chopper/environment.yml new file mode 100644 index 00000000..2195b5ed --- /dev/null +++ b/modules/nf-core/chopper/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::chopper=0.9.0 diff --git a/modules/nf-core/chopper/main.nf b/modules/nf-core/chopper/main.nf new file mode 100644 index 00000000..6fc0b2d2 --- /dev/null +++ b/modules/nf-core/chopper/main.nf @@ -0,0 +1,56 @@ +process CHOPPER { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/chopper:0.9.0--hdcf5f25_0': + 'biocontainers/chopper:0.9.0--hdcf5f25_0' }" + + input: + tuple val(meta), path(fastq) + path fasta + + output: + tuple val(meta), path("*.fastq.gz") , emit: fastq + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def fasta_filtering = fasta ? 
"--contam ${fasta}" : "" + + if ("$fastq" == "${prefix}.fastq.gz") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + zcat \\ + $args \\ + $fastq | \\ + chopper \\ + --threads $task.cpus \\ + $fasta_filtering \\ + $args2 | \\ + gzip \\ + $args3 > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + chopper: \$(chopper --version 2>&1 | cut -d ' ' -f 2) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo | gzip > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + chopper: \$(chopper --version 2>&1 | cut -d ' ' -f 2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/chopper/meta.yml b/modules/nf-core/chopper/meta.yml new file mode 100644 index 00000000..049cf62d --- /dev/null +++ b/modules/nf-core/chopper/meta.yml @@ -0,0 +1,64 @@ +name: "chopper" +description: Filter and trim long read data. +keywords: + - filter + - trimming + - fastq + - nanopore + - qc +tools: + - "zcat": + description: "zcat uncompresses either a list of files on the command line or + its standard input and writes the uncompressed data on standard output." + documentation: "https://linux.die.net/man/1/zcat" + args_id: "$args" + identifier: "" + - "chopper": + description: "A rust command line for filtering and trimming long reads." + homepage: "https://github.com/wdecoster/chopper" + documentation: "https://github.com/wdecoster/chopper" + tool_dev_url: "https://github.com/wdecoster/chopper" + doi: "10.1093/bioinformatics/bty149" + licence: ["MIT"] + args_id: "$args2" + identifier: "" + - "gzip": + description: "Gzip reduces the size of the named files using Lempel-Ziv coding + (LZ77)." + documentation: "https://linux.die.net/man/1/gzip" + args_id: "$args3" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: FastQ with reads from long read sequencing e.g. PacBio or ONT + pattern: "*.{fastq.gz}" + - - fasta: + type: file + description: An optional reference fasta file against which to remove reads that align to it. + pattern: "*.fasta" +output: + - fastq: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.fastq.gz": + type: file + description: Filtered and trimmed FastQ file + pattern: "*.{fastq.gz}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@FynnFreyer" +maintainers: + - "@FynnFreyer" diff --git a/modules/nf-core/chopper/tests/main.nf.test b/modules/nf-core/chopper/tests/main.nf.test new file mode 100644 index 00000000..e611fa9f --- /dev/null +++ b/modules/nf-core/chopper/tests/main.nf.test @@ -0,0 +1,107 @@ +nextflow_process { + + name "Test Process CHOPPER" + script "../main.nf" + process "CHOPPER" + tag "chopper" + tag "modules" + tag "modules_nfcore" + + test("test with lambda reference") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id:'test_out' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test_2.fastq.gz', checkIfExists: true) + ] + input[1] = file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + """ + } + } + + then { + + def fastq_content = path(process.out.fastq.get(0).get(1)).linesGzip + + assertAll( + { assert process.success }, + // original pytest checks + { assert process.out.fastq.get(0).get(1) ==~ ".*/test_out.fastq.gz" }, + { assert !fastq_content.contains("@a52a642e-88d0-4584-babd-414ea84db484 runid=71c83ae0021f873e29b130c6562a4c27185f93b8 read=2768 ch=489 start_time=2021-08-11T12:07:39Z flow_cell_id=FAQ57606 protocol_group_id=210811_47CoV_SA sample_id=CS5 barcode=barcode04 barcode_alias=barcode04")}, + // additional nf-test checks + // Order of reads is not deterministic, so only assess whether the number of reads is correct + { assert snapshot( + fastq_content.size(), + process.out.versions + ).match() } + ) + } + } + + test("test without lambda reference") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id:'test_out' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + + def fastq_content = path(process.out.fastq.get(0).get(1)).linesGzip + + assertAll( + { assert process.success }, + // original pytest checks + { assert process.out.fastq.get(0).get(1) ==~ ".*/test_out.fastq.gz" }, + { assert fastq_content.contains("@2109d790-67ec-4fd1-8931-6c7e61908ff3 runid=97ca62ca093ff43533aa34c38a10b1d6325e7e7b read=52274 ch=243 start_time=2021-02-05T23:27:30Z flow_cell_id=FAP51364 protocol_group_id=data sample_id=RN20097 barcode=barcode01 barcode_alias=barcode01")}, + // additional nf-test checks + // Order of reads is not deterministic, so only assess whether the number of reads is correct + { assert snapshot( + fastq_content.size(), + process.out.versions + ).match() } + ) + } + } + + test("test-chopper-stub") { + options '-stub' + + when { + process { + """ + input[0] = [ + [id:'test_out' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out, + process.out.versions + ).match() } + ) + } + } +} \ No newline at end of file diff --git a/modules/nf-core/chopper/tests/main.nf.test.snap b/modules/nf-core/chopper/tests/main.nf.test.snap new file mode 100644 index 00000000..60522256 --- /dev/null +++ b/modules/nf-core/chopper/tests/main.nf.test.snap @@ -0,0 
+1,64 @@ +{ + "test without lambda reference": { + "content": [ + 400, + [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-31T15:14:36.37897815" + }, + "test with lambda reference": { + "content": [ + 15984, + [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-31T15:14:31.324993049" + }, + "test-chopper-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_out" + }, + "test_out.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ], + "fastq": [ + [ + { + "id": "test_out" + }, + "test_out.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ] + }, + [ + "versions.yml:md5,74a27493c09d0c481f6e52b517e12023" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-31T15:29:08.715579423" + } +} \ No newline at end of file diff --git a/modules/nf-core/chopper/tests/tags.yml b/modules/nf-core/chopper/tests/tags.yml new file mode 100644 index 00000000..89b6233b --- /dev/null +++ b/modules/nf-core/chopper/tests/tags.yml @@ -0,0 +1,2 @@ +chopper: + - modules/nf-core/chopper/** diff --git a/modules/nf-core/concoct/concoct/environment.yml b/modules/nf-core/concoct/concoct/environment.yml new file mode 100644 index 00000000..af1c59f0 --- /dev/null +++ b/modules/nf-core/concoct/concoct/environment.yml @@ -0,0 +1,7 @@ +name: concoct_concoct +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::concoct=1.1.0 diff --git a/modules/nf-core/concoct/concoct/main.nf b/modules/nf-core/concoct/concoct/main.nf new file mode 100644 index 00000000..958defd8 --- /dev/null +++ b/modules/nf-core/concoct/concoct/main.nf @@ -0,0 +1,59 @@ + +process CONCOCT_CONCOCT { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py312h245ed52_6': + 'biocontainers/concoct:1.1.0--py312h245ed52_6' }" + + input: + tuple val(meta), path(coverage_file), path(fasta) + + output: + tuple val(meta), path("*_args.txt") , emit: args_txt + tuple val(meta), path("*_clustering_gt1000.csv") , emit: clustering_csv + tuple val(meta), path("*_log.txt") , emit: log_txt + tuple val(meta), path("*_original_data_gt1000.csv") , emit: original_data_csv + tuple val(meta), path("*_PCA_components_data_gt1000.csv") , emit: pca_components_csv + tuple val(meta), path("*_PCA_transformed_data_gt1000.csv") , emit: pca_transformed_csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + concoct \\ + $args \\ + --threads ${task.cpus} \\ + --coverage_file ${coverage_file} \\ + --composition_file ${fasta} \\ + -b ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_args.txt + touch ${prefix}_clustering_gt1000.csv + touch ${prefix}_log.txt + touch ${prefix}_original_data_gt1000.csv + touch ${prefix}_PCA_components_data_gt1000.csv + touch ${prefix}_PCA_transformed_data_gt1000.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/concoct/concoct/meta.yml b/modules/nf-core/concoct/concoct/meta.yml new file mode 100644 index 00000000..d92455cf --- /dev/null +++ b/modules/nf-core/concoct/concoct/meta.yml @@ -0,0 +1,72 @@ +name: "concoct_concoct" +description: Unsupervised binning of metagenomic contigs by using nucleotide composition - kmer frequencies - and coverage data for multiple samples +keywords: + - contigs + - fragment + - mags + - binning + - concoct + - kmer + - nucleotide composition + - metagenomics + - bins +tools: + - "concoct": + description: "Clustering cONtigs with COverage and ComposiTion" + homepage: "https://concoct.readthedocs.io/en/latest/index.html" + documentation: "https://concoct.readthedocs.io/en/latest/index.html" + tool_dev_url: "https://github.com/BinPro/CONCOCT" + doi: "10.1038/nmeth.3103" + licence: ["FreeBSD"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - coverage_file: + type: file + description: Subcontig coverage TSV table (typically generated with concoct_coverage_table.py) + pattern: "*.tsv" + - fasta: + type: file + description: FASTA file containing subcontigs (typically generated with cutup_fasta.py) + pattern: "*.fasta" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
          [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - args_txt: + type: file + description: File containing execution parameters + pattern: "*_args.txt" + - clustering_csv: + type: file + description: CSV containing information which subcontig is assigned to which cluster + pattern: "*_clustering_gt1000.csv" + - log_txt: + type: file + description: Log file of tool execution + pattern: "*_log.txt" + - original_data_csv: + type: file + description: Original CONCOCT GT1000 output + pattern: "*_original_data_gt1000.csv" + - pca_components_csv: + type: file + description: Untransformed PCA component values + pattern: "*_PCA_components_data_gt1000.csv" + - pca_transformed_csv: + type: file + description: Transformed PCA component values + pattern: "*_PCA_transformed_data_gt1000.csv" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/concoct/concoct/tests/main.nf.test b/modules/nf-core/concoct/concoct/tests/main.nf.test new file mode 100644 index 00000000..a979ebe3 --- /dev/null +++ b/modules/nf-core/concoct/concoct/tests/main.nf.test @@ -0,0 +1,105 @@ +nextflow_process { + + name "Test Process CONCOCT_CONCOCT" + script "../main.nf" + process "CONCOCT_CONCOCT" + + tag "modules" + tag "modules_nfcore" + tag "concoct" + tag "concoct/concoct" + tag "concoct/cutupfasta" + tag "concoct/concoctcoveragetable" + + setup { + run("CONCOCT_CUTUPFASTA") { + script "../../cutupfasta/main.nf" + process { + """ + input[0] = [ + [id: 'test', single_end: false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = true + """ + } + } + + run("CONCOCT_CONCOCTCOVERAGETABLE") { + script "../../concoctcoveragetable/main.nf" + process { + """ + ch_bam_input = Channel + .fromList([ + [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ], + [ + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai', checkIfExists: true), + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ] + ] + ]) + + + input[0] = CONCOCT_CUTUPFASTA.out.bed.join( ch_bam_input ) + """ + } + } + } + + test("sarscov2 - bam") { + + when { + process { + """ + ch_input_for_concoctconcoct = CONCOCT_CONCOCTCOVERAGETABLE.out.tsv + .join(CONCOCT_CUTUPFASTA.out.fasta) + + input[0] = ch_input_for_concoctconcoct + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.args_txt, + process.out.clustering_csv, + process.out.original_data_csv, + process.out.pca_components_csv, + process.out.pca_transformed_csv, + process.out.versions, + file(process.out.log_txt[0][1]).readLines().last().contains("CONCOCT Finished, the log shows how it went.") + ).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id: 'test'], [], [] ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/concoct/concoct/tests/main.nf.test.snap b/modules/nf-core/concoct/concoct/tests/main.nf.test.snap new file mode 100644
index 00000000..6ff6ac97 --- /dev/null +++ b/modules/nf-core/concoct/concoct/tests/main.nf.test.snap @@ -0,0 +1,173 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test_args.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test_clustering_gt1000.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test_log.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test_original_data_gt1000.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test_PCA_components_data_gt1000.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "5": [ + [ + { + "id": "test" + }, + "test_PCA_transformed_data_gt1000.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + "versions.yml:md5,ed583fb0862e060844ceffee6bf0eeb7" + ], + "args_txt": [ + [ + { + "id": "test" + }, + "test_args.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "clustering_csv": [ + [ + { + "id": "test" + }, + "test_clustering_gt1000.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log_txt": [ + [ + { + "id": "test" + }, + "test_log.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "original_data_csv": [ + [ + { + "id": "test" + }, + "test_original_data_gt1000.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "pca_components_csv": [ + [ + { + "id": "test" + }, + "test_PCA_components_data_gt1000.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "pca_transformed_csv": [ + [ + { + "id": "test" + }, + "test_PCA_transformed_data_gt1000.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,ed583fb0862e060844ceffee6bf0eeb7" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-24T08:39:53.759611572" + }, + "sarscov2 - bam": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test_args.txt:md5,277d8a15f732ae1bd4d6839fd768113e" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test_clustering_gt1000.csv:md5,8cb3e6901075bf07966d08e1816762ce" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test_original_data_gt1000.csv:md5,7474c6a670a608c2c9c9b9edde724074" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test_PCA_components_data_gt1000.csv:md5,e69fd8a810563e62cb0c3998842b3659" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test_PCA_transformed_data_gt1000.csv:md5,558a35f720a9bfa54cbf7cdf099fd367" + ] + ], + [ + "versions.yml:md5,ed583fb0862e060844ceffee6bf0eeb7" + ], + true + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-21T20:25:56.619830843" + } +} \ No newline at end of file diff --git a/modules/nf-core/concoct/concoct/tests/tags.yml b/modules/nf-core/concoct/concoct/tests/tags.yml new file mode 100644 index 00000000..90503318 --- /dev/null +++ b/modules/nf-core/concoct/concoct/tests/tags.yml @@ -0,0 +1,2 @@ +concoct/concoct: + - "modules/nf-core/concoct/concoct/**" diff --git a/modules/nf-core/concoct/concoctcoveragetable/environment.yml b/modules/nf-core/concoct/concoctcoveragetable/environment.yml new file mode 100644 index 00000000..4152605f --- /dev/null +++ b/modules/nf-core/concoct/concoctcoveragetable/environment.yml @@ -0,0 +1,7 @@ +name: concoct_concoctcoveragetable +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::concoct=1.1.0 diff --git 
a/modules/nf-core/concoct/concoctcoveragetable/main.nf b/modules/nf-core/concoct/concoctcoveragetable/main.nf new file mode 100644 index 00000000..65a81454 --- /dev/null +++ b/modules/nf-core/concoct/concoctcoveragetable/main.nf @@ -0,0 +1,48 @@ + +process CONCOCT_CONCOCTCOVERAGETABLE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py312h245ed52_6': + 'biocontainers/concoct:1.1.0--py312h245ed52_6' }" + + input: + tuple val(meta), path(bed), path(bamfiles), path(baifiles) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + concoct_coverage_table.py \\ + $args \\ + $bed \\ + $bamfiles \\ + > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/concoct/concoctcoveragetable/meta.yml b/modules/nf-core/concoct/concoctcoveragetable/meta.yml new file mode 100644 index 00000000..26b9793c --- /dev/null +++ b/modules/nf-core/concoct/concoctcoveragetable/meta.yml @@ -0,0 +1,55 @@ +name: "concoct_concoctcoveragetable" +description: Generate the input coverage table for CONCOCT using a BEDFile +keywords: + - contigs + - fragment + - mags + - binning + - bed + - bam + - subcontigs + - coverage +tools: + - "concoct": + description: "Clustering cONtigs with COverage and ComposiTion" + homepage: "https://concoct.readthedocs.io/en/latest/index.html" + documentation: "https://concoct.readthedocs.io/en/latest/index.html" + tool_dev_url: "https://github.com/BinPro/CONCOCT" + doi: "10.1038/nmeth.3103" + licence: ["FreeBSD"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bed: + type: file + description: BED file describing where each contig was cut up (typically output from CONCOCT's cut_up_fasta.py) + pattern: "*.bed" + - bamfiles: + type: file + description: A single or list of BAM files of reads mapped back to original contigs (prior cutting up) + pattern: "*.bam" + - baifiles: + type: file + description: A single or list of BAM index files (.bai) corresponding to BAM + pattern: "*.bam" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - tsv: + type: file + description: Contig coverage table + pattern: "*.tsv" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/concoct/concoctcoveragetable/tests/main.nf.test b/modules/nf-core/concoct/concoctcoveragetable/tests/main.nf.test new file mode 100644 index 00000000..dc378b52 --- /dev/null +++ b/modules/nf-core/concoct/concoctcoveragetable/tests/main.nf.test @@ -0,0 +1,84 @@ +nextflow_process { + + name "Test Process CONCOCT_CONCOCTCOVERAGETABLE" + script "../main.nf" + process "CONCOCT_CONCOCTCOVERAGETABLE" + + tag "modules" + tag "modules_nfcore" + tag "concoct" + tag "concoct/concoctcoveragetable" + tag "concoct/cutupfasta" + + setup { + run("CONCOCT_CUTUPFASTA") { + script "../../cutupfasta/main.nf" + process { + """ + input[0] = [ + [id: 'test', single_end: false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = true + """ + } + } + } + + test("sarscov2 - bam") { + + when { + process { + """ + ch_bam_input = Channel + .fromList([ + [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ], + [ + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai', checkIfExists: true), + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ] + ] + ]) + + + input[0] = CONCOCT_CUTUPFASTA.out.bed.join( ch_bam_input ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id:'test'], [], [] ,[] ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/concoct/concoctcoveragetable/tests/main.nf.test.snap b/modules/nf-core/concoct/concoctcoveragetable/tests/main.nf.test.snap new file mode 100644 index 00000000..e92af4b2 --- /dev/null +++ b/modules/nf-core/concoct/concoctcoveragetable/tests/main.nf.test.snap @@ -0,0 +1,70 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,156a9a23e05b34188d03d52b39fae343" + ], + "tsv": [ + [ + { + "id": "test" + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,156a9a23e05b34188d03d52b39fae343" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-24T08:41:27.166202038" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,0183c95ec03d2a6c83a4ca82935308a4" + ] + ], + "1": [ + "versions.yml:md5,156a9a23e05b34188d03d52b39fae343" + ], + "tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,0183c95ec03d2a6c83a4ca82935308a4" + ] + ], + "versions": [ + "versions.yml:md5,156a9a23e05b34188d03d52b39fae343" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + 
"timestamp": "2024-06-21T15:59:09.328659542" + } +} \ No newline at end of file diff --git a/modules/nf-core/concoct/concoctcoveragetable/tests/tags.yml b/modules/nf-core/concoct/concoctcoveragetable/tests/tags.yml new file mode 100644 index 00000000..20480896 --- /dev/null +++ b/modules/nf-core/concoct/concoctcoveragetable/tests/tags.yml @@ -0,0 +1,2 @@ +concoct/concoctcoveragetable: + - "modules/nf-core/concoct/concoctcoveragetable/**" diff --git a/modules/nf-core/concoct/cutupfasta/environment.yml b/modules/nf-core/concoct/cutupfasta/environment.yml new file mode 100644 index 00000000..c6927197 --- /dev/null +++ b/modules/nf-core/concoct/cutupfasta/environment.yml @@ -0,0 +1,7 @@ +name: concoct_cutupfasta +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::concoct=1.1.0 diff --git a/modules/nf-core/concoct/cutupfasta/main.nf b/modules/nf-core/concoct/cutupfasta/main.nf new file mode 100644 index 00000000..9a69e72c --- /dev/null +++ b/modules/nf-core/concoct/cutupfasta/main.nf @@ -0,0 +1,54 @@ + +process CONCOCT_CUTUPFASTA { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py312h245ed52_6': + 'biocontainers/concoct:1.1.0--py312h245ed52_6' }" + + input: + tuple val(meta), path(fasta) + val(bed) + + output: + tuple val(meta), path("*.fasta"), emit: fasta + tuple val(meta), path("*.bed") , optional: true, emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bedfile = bed ? "-b ${prefix}.bed" : "" + if ("$fasta" == "${prefix}.fasta") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + cut_up_fasta.py \\ + $fasta \\ + $args \\ + $bedfile \\ + > ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bedfile = bed ? "-b ${prefix}.bed" : "" + if ("$fasta" == "${prefix}.fasta") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + touch ${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/concoct/cutupfasta/meta.yml b/modules/nf-core/concoct/cutupfasta/meta.yml new file mode 100644 index 00000000..44a28723 --- /dev/null +++ b/modules/nf-core/concoct/cutupfasta/meta.yml @@ -0,0 +1,53 @@ +name: "concoct_cutupfasta" +description: Cut up fasta file in non-overlapping or overlapping parts of equal length. +keywords: + - contigs + - fragment + - mags + - binning + - fasta + - cut + - cut up +tools: + - "concoct": + description: "Clustering cONtigs with COverage and ComposiTion" + homepage: "https://concoct.readthedocs.io/en/latest/index.html" + documentation: "https://concoct.readthedocs.io/en/latest/index.html" + tool_dev_url: "https://github.com/BinPro/CONCOCT" + doi: "10.1038/nmeth.3103" + licence: ["FreeBSD"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fasta: + type: file + description: (Uncompressed) FASTA file containing contigs + pattern: "*.{fasta,fna,fa,fas}" + - bed: + type: boolean + description: Specify whether to generate a BED file describing where each contig was cut up +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fasta: + type: file + description: Cut up fasta file in non-overlapping or overlapping parts of equal length. + pattern: "*.fasta" + - bed: + type: file + description: Optional BED File containing locations on original contigs where they were cut up. + pattern: "*.bed" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/concoct/cutupfasta/tests/main.nf.test b/modules/nf-core/concoct/cutupfasta/tests/main.nf.test new file mode 100644 index 00000000..0c395ee2 --- /dev/null +++ b/modules/nf-core/concoct/cutupfasta/tests/main.nf.test @@ -0,0 +1,60 @@ +nextflow_process { + + name "Test Process CONCOCT_CUTUPFASTA" + script "../main.nf" + process "CONCOCT_CUTUPFASTA" + + tag "modules" + tag "modules_nfcore" + tag "concoct" + tag "concoct/cutupfasta" + + test("sarscov2 - genome - fasta") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - genome - fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/concoct/cutupfasta/tests/main.nf.test.snap b/modules/nf-core/concoct/cutupfasta/tests/main.nf.test.snap new file mode 100644 index 00000000..74be0e7a --- /dev/null +++ b/modules/nf-core/concoct/cutupfasta/tests/main.nf.test.snap @@ -0,0 +1,84 @@ +{ + "sarscov2 - genome - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fasta:md5,b7d972e6efa0b306e02e46a685310c7f" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,9785f4f0f5f6e8d020a52a9e2c4fa3e0" + ], + "bed": [ + + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fasta:md5,b7d972e6efa0b306e02e46a685310c7f" + ] + ], + "versions": [ + "versions.yml:md5,9785f4f0f5f6e8d020a52a9e2c4fa3e0" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-21T15:13:08.126117371" + }, + "sarscov2 - genome - fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,9785f4f0f5f6e8d020a52a9e2c4fa3e0" + ], + "bed": [ + + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,9785f4f0f5f6e8d020a52a9e2c4fa3e0" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-21T15:13:20.929503446" + } +} \ No newline 
at end of file diff --git a/modules/nf-core/concoct/cutupfasta/tests/tags.yml b/modules/nf-core/concoct/cutupfasta/tests/tags.yml new file mode 100644 index 00000000..84a0432c --- /dev/null +++ b/modules/nf-core/concoct/cutupfasta/tests/tags.yml @@ -0,0 +1,2 @@ +concoct/cutupfasta: + - "modules/nf-core/concoct/cutupfasta/**" diff --git a/modules/nf-core/concoct/extractfastabins/environment.yml b/modules/nf-core/concoct/extractfastabins/environment.yml new file mode 100644 index 00000000..e31d2e86 --- /dev/null +++ b/modules/nf-core/concoct/extractfastabins/environment.yml @@ -0,0 +1,7 @@ +name: concoct_extractfastabins +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::concoct=1.1.0 diff --git a/modules/nf-core/concoct/extractfastabins/main.nf b/modules/nf-core/concoct/extractfastabins/main.nf new file mode 100644 index 00000000..add94c93 --- /dev/null +++ b/modules/nf-core/concoct/extractfastabins/main.nf @@ -0,0 +1,56 @@ +process CONCOCT_EXTRACTFASTABINS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py312h245ed52_6': + 'biocontainers/concoct:1.1.0--py312h245ed52_6' }" + + input: + tuple val(meta), path(original_fasta), path(csv) + + output: + tuple val(meta), path("${prefix}/*.fa.gz"), emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir ${prefix} + + extract_fasta_bins.py \\ + $args \\ + $original_fasta \\ + $csv \\ + --output_path ${prefix} + + ## Add prefix to each file to disambiguate one sample's 1.fa, 2.fa from sample2 + for i in ${prefix}/*.fa; do + mv \${i} \${i/\\///${prefix}_} + gzip \${i/\\///${prefix}_} + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p ${prefix} + echo "" | gzip > ${prefix}/${prefix}.fa.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/concoct/extractfastabins/meta.yml b/modules/nf-core/concoct/extractfastabins/meta.yml new file mode 100644 index 00000000..a9ac00a3 --- /dev/null +++ b/modules/nf-core/concoct/extractfastabins/meta.yml @@ -0,0 +1,52 @@ +name: "concoct_extractfastabins" +description: Creates a FASTA file for each new cluster assigned by CONCOCT +keywords: + - contigs + - fragment + - mags + - binning + - fasta + - cut + - cut up + - bins + - merge +tools: + - "concoct": + description: "Clustering cONtigs with COverage and ComposiTion" + homepage: "https://concoct.readthedocs.io/en/latest/index.html" + documentation: "https://concoct.readthedocs.io/en/latest/index.html" + tool_dev_url: "https://github.com/BinPro/CONCOCT" + doi: "10.1038/nmeth.3103" + licence: ["FreeBSD"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
          [ id:'test', single_end:false ] + - original_fasta: + type: file + description: Original input FASTA file to CONCOCT cut_up_fasta + pattern: "*.{fasta,fna,fa,fas}" + - csv: + type: file + description: Output table of merge_cutup_clustering with new cluster assignments + pattern: ".csv" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fasta: + type: file + description: FASTA files containing CONCOCT predicted bin clusters, named numerically by CONCOCT cluster ID in a directory called `fasta_bins` + pattern: "*.fa.gz" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/concoct/extractfastabins/tests/main.nf.test b/modules/nf-core/concoct/extractfastabins/tests/main.nf.test new file mode 100644 index 00000000..ce14d726 --- /dev/null +++ b/modules/nf-core/concoct/extractfastabins/tests/main.nf.test @@ -0,0 +1,124 @@ +nextflow_process { + + name "Test Process CONCOCT_EXTRACTFASTABINS" + script "../main.nf" + process "CONCOCT_EXTRACTFASTABINS" + tag "modules" + tag "modules_nfcore" + tag "concoct" + tag "concoct/extractfastabins" + tag "concoct/cutupfasta" + tag "concoct/concoctcoveragetable" + tag "concoct/concoct" + tag "concoct/mergecutupclustering" + + setup { + run("CONCOCT_CUTUPFASTA") { + script "../../cutupfasta/main.nf" + process { + """ + input[0] = [ + [id: 'test', single_end: false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = true + """ + } + } + + run("CONCOCT_CONCOCTCOVERAGETABLE") { + script "../../concoctcoveragetable/main.nf" + process { + """ + ch_bam_input = Channel + .fromList([ + [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ], + [ + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai', checkIfExists: true), + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ] + ] + ]) + + + input[0] = CONCOCT_CUTUPFASTA.out.bed.join( ch_bam_input ) + """ + } + } + + run("CONCOCT_CONCOCT") { + script "../../concoct/main.nf" + process { + """ + ch_input_for_concoctconcoct = CONCOCT_CONCOCTCOVERAGETABLE.out.tsv + .join(CONCOCT_CUTUPFASTA.out.fasta) + + input[0] = ch_input_for_concoctconcoct + """ + } + } + + run("CONCOCT_MERGECUTUPCLUSTERING") { + script "../../mergecutupclustering/main.nf" + process { + """ + input[0] = CONCOCT_CONCOCT.out.clustering_csv + """ + } + } + } + + test("sarscov2 - bam") { + + when { + process { + """ + fasta = [ + [id: 'test', single_end: false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[0] = Channel.of(fasta).join(CONCOCT_MERGECUTUPCLUSTERING.out.csv) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true),
+ [] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/concoct/extractfastabins/tests/main.nf.test.snap b/modules/nf-core/concoct/extractfastabins/tests/main.nf.test.snap new file mode 100644 index 00000000..5af33a37 --- /dev/null +++ b/modules/nf-core/concoct/extractfastabins/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,768f307345c2552b0613b946fca44873" + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,768f307345c2552b0613b946fca44873" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-24T08:33:53.38186154" + }, + "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_3.fa.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "1": [ + "versions.yml:md5,768f307345c2552b0613b946fca44873" + ], + "fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test_3.fa.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "versions": [ + "versions.yml:md5,768f307345c2552b0613b946fca44873" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-21T21:05:15.805714717" + } +} \ No newline at end of file diff --git a/modules/nf-core/concoct/extractfastabins/tests/tags.yml b/modules/nf-core/concoct/extractfastabins/tests/tags.yml new file mode 100644 index 00000000..6db649a2 --- /dev/null +++ b/modules/nf-core/concoct/extractfastabins/tests/tags.yml @@ -0,0 +1,2 @@ +concoct/extractfastabins: + - "modules/nf-core/concoct/extractfastabins/**" diff --git a/modules/nf-core/concoct/mergecutupclustering/environment.yml b/modules/nf-core/concoct/mergecutupclustering/environment.yml new file mode 100644 index 00000000..459d65c7 --- /dev/null +++ b/modules/nf-core/concoct/mergecutupclustering/environment.yml @@ -0,0 +1,7 @@ +name: concoct_mergecutupclustering +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::concoct=1.1.0 diff --git a/modules/nf-core/concoct/mergecutupclustering/main.nf b/modules/nf-core/concoct/mergecutupclustering/main.nf new file mode 100644 index 00000000..01e2e46d --- /dev/null +++ b/modules/nf-core/concoct/mergecutupclustering/main.nf @@ -0,0 +1,48 @@ +process CONCOCT_MERGECUTUPCLUSTERING { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/concoct:1.1.0--py312h245ed52_6': + 'biocontainers/concoct:1.1.0--py312h245ed52_6' }" + + input: + tuple val(meta), path(clustering_csv) + + output: + tuple val(meta), path("*.csv"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$clustering_csv" == "${prefix}.csv") error "Input and output names are the same, set prefix in module configuration to disambiguate!" 
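+ // merge_cutup_clustering.py collapses CONCOCT's per-part (cut-up subcontig) cluster assignments back into one consensus cluster per original contig, written to ${prefix}.csv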
+ """ + merge_cutup_clustering.py \\ + $args \\ + $clustering_csv \\ + > ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$clustering_csv" == "${prefix}.csv") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + touch ${prefix}.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + concoct: \$(echo \$(concoct --version 2>&1) | sed 's/concoct //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/concoct/mergecutupclustering/meta.yml b/modules/nf-core/concoct/mergecutupclustering/meta.yml new file mode 100644 index 00000000..e34a423c --- /dev/null +++ b/modules/nf-core/concoct/mergecutupclustering/meta.yml @@ -0,0 +1,47 @@ +name: "concoct_mergecutupclustering" +description: Merge consecutive parts of the original contigs original cut up by cut_up_fasta.py +keywords: + - contigs + - fragment + - mags + - binning + - fasta + - cut + - cut up + - merge +tools: + - "concoct": + description: "Clustering cONtigs with COverage and ComposiTion" + homepage: "https://concoct.readthedocs.io/en/latest/index.html" + documentation: "https://concoct.readthedocs.io/en/latest/index.html" + tool_dev_url: "https://github.com/BinPro/CONCOCT" + doi: "10.1038/nmeth.3103" + licence: ["FreeBSD"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - clustering_csv: + type: file + description: Input cutup clustering result. Typically *_gt1000.csv from concoct + pattern: "*.csv" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csv: + type: file + description: Cluster assignments per contig part with concensus cluster + pattern: "*.csv" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/concoct/mergecutupclustering/tests/main.nf.test b/modules/nf-core/concoct/mergecutupclustering/tests/main.nf.test new file mode 100644 index 00000000..b888c2fa --- /dev/null +++ b/modules/nf-core/concoct/mergecutupclustering/tests/main.nf.test @@ -0,0 +1,107 @@ +nextflow_process { + + name "Test Process CONCOCT_MERGECUTUPCLUSTERING" + script "../main.nf" + process "CONCOCT_MERGECUTUPCLUSTERING" + + tag "modules" + tag "modules_nfcore" + tag "concoct" + tag "concoct/mergecutupclustering" + tag "concoct/cutupfasta" + tag "concoct/concoctcoveragetable" + tag "concoct/concoct" + + setup { + run("CONCOCT_CUTUPFASTA") { + script "../../cutupfasta/main.nf" + process { + """ + input[0] = [ + [id: 'test', single_end: false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = true + """ + } + } + + run("CONCOCT_CONCOCTCOVERAGETABLE") { + script "../../concoctcoveragetable/main.nf" + process { + """ + ch_bam_input = Channel + .fromList([ + [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ], + [ + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai', checkIfExists: true), + file(params.modules_testdata_base_path + '/genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ] + ] + ]) + + + input[0] = CONCOCT_CUTUPFASTA.out.bed.join( ch_bam_input ) + """ + } + } + + run("CONCOCT_CONCOCT") { + script "../../concoct/main.nf" + process { + """ + ch_input_for_concoctconcoct = CONCOCT_CONCOCTCOVERAGETABLE.out.tsv + .join(CONCOCT_CUTUPFASTA.out.fasta) + + input[0] = ch_input_for_concoctconcoct + """ + } + } + } + + test("sarscov2 - bam") { + + when { + process { + """ + input[0] = CONCOCT_CONCOCT.out.clustering_csv + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id: 'test'], []] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/concoct/mergecutupclustering/tests/main.nf.test.snap b/modules/nf-core/concoct/mergecutupclustering/tests/main.nf.test.snap new file mode 100644 index 00000000..fd4e6813 --- /dev/null +++ b/modules/nf-core/concoct/mergecutupclustering/tests/main.nf.test.snap @@ -0,0 +1,70 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,bf584483f6d21a4c6b34a4c517c88283" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,bf584483f6d21a4c6b34a4c517c88283" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-24T08:41:05.313757446" + }, 
+ "sarscov2 - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.csv:md5,ac57fce859cd28f5d18e1f4bbe056a35" + ] + ], + "1": [ + "versions.yml:md5,bf584483f6d21a4c6b34a4c517c88283" + ], + "csv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.csv:md5,ac57fce859cd28f5d18e1f4bbe056a35" + ] + ], + "versions": [ + "versions.yml:md5,bf584483f6d21a4c6b34a4c517c88283" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-21T20:35:39.630679488" + } +} \ No newline at end of file diff --git a/modules/nf-core/concoct/mergecutupclustering/tests/tags.yml b/modules/nf-core/concoct/mergecutupclustering/tests/tags.yml new file mode 100644 index 00000000..0fc6dc6e --- /dev/null +++ b/modules/nf-core/concoct/mergecutupclustering/tests/tags.yml @@ -0,0 +1,2 @@ +concoct/mergecutupclustering: + - "modules/nf-core/concoct/mergecutupclustering/**" diff --git a/modules/nf-core/dastool/dastool/main.nf b/modules/nf-core/dastool/dastool/main.nf new file mode 100644 index 00000000..8440edc7 --- /dev/null +++ b/modules/nf-core/dastool/dastool/main.nf @@ -0,0 +1,62 @@ +process DASTOOL_DASTOOL { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::das_tool=1.1.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/das_tool:1.1.6--r42hdfd78af_0' : + 'biocontainers/das_tool:1.1.6--r42hdfd78af_0' }" + + input: + tuple val(meta), path(contigs), path(bins) + path(proteins) + path(db_directory) + + output: + tuple val(meta), path("*.log") , emit: log + tuple val(meta), path("*_summary.tsv") , optional: true, emit: summary + tuple val(meta), path("*_DASTool_contig2bin.tsv") , optional: true, emit: contig2bin + tuple val(meta), path("*.eval") , optional: true, emit: eval + tuple val(meta), path("*_DASTool_bins/*.fa") , optional: true, emit: bins + tuple val(meta), path("*.pdf") , optional: true, emit: pdfs + tuple val(meta), path("*.candidates.faa") , optional: true, emit: fasta_proteins + tuple val(meta), path("*.faa") , optional: true, emit: candidates_faa + tuple val(meta), path("*.archaea.scg") , optional: true, emit: fasta_archaea_scg + tuple val(meta), path("*.bacteria.scg") , optional: true, emit: fasta_bacteria_scg + tuple val(meta), path("*.b6") , optional: true, emit: b6 + tuple val(meta), path("*.seqlength") , optional: true, emit: seqlength + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bin_list = bins instanceof List ? bins.join(",") : "$bins" + def db_dir = db_directory ? "--db_directory $db_directory" : "" + def clean_contigs = contigs.toString() - ".gz" + def decompress_contigs = contigs.toString() == clean_contigs ? "" : "gunzip -q -f $contigs" + def clean_proteins = proteins ? proteins.toString() - ".gz" : "" + def decompress_proteins = proteins ? "gunzip -f $proteins" : "" + def proteins_pred = proteins ? 
"-p $clean_proteins" : "" + + """ + $decompress_proteins + $decompress_contigs + + DAS_Tool \\ + $args \\ + $proteins_pred \\ + $db_dir \\ + -t $task.cpus \\ + -i $bin_list \\ + -c $clean_contigs \\ + -o $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dastool: \$( DAS_Tool --version 2>&1 | grep "DAS Tool" | sed 's/DAS Tool //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/dastool/dastool/meta.yml b/modules/nf-core/dastool/dastool/meta.yml new file mode 100644 index 00000000..1d4ffa8b --- /dev/null +++ b/modules/nf-core/dastool/dastool/meta.yml @@ -0,0 +1,104 @@ +name: dastool_dastool +description: DAS Tool binning step. +keywords: + - binning + - das tool + - table + - de novo + - bins + - contigs + - assembly + - das_tool +tools: + - dastool: + description: | + DAS Tool is an automated method that integrates the results + of a flexible number of binning algorithms to calculate an optimized, non-redundant + set of bins from a single assembly. + + homepage: https://github.com/cmks/DAS_Tool + documentation: https://github.com/cmks/DAS_Tool + tool_dev_url: https://github.com/cmks/DAS_Tool + doi: "10.1038/s41564-018-0171-1" + licence: ["BSD"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - contigs: + type: file + description: fasta file + pattern: "*.{fa.gz,fas.gz,fasta.gz}" + - bins: + type: file + description: "FastaToContig2Bin tabular file generated with dastool/fastatocontig2bin" + pattern: "*.tsv" + - proteins: + type: file + description: Predicted proteins in prodigal fasta format (>scaffoldID_geneNo) + pattern: "*.{fa.gz,fas.gz,fasta.gz}" + - db_directory: + type: file + description: (optional) Directory of single copy gene database. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - version: + type: file + description: File containing software version + pattern: "versions.yml" + - log: + type: file + description: Log file of the run + pattern: "*.log" + - summary: + type: file + description: Summary of output bins including quality and completeness estimates + pattern: "*summary.txt" + - contig2bin: + type: file + description: Scaffolds to bin file of output bins + pattern: "*.contig2bin.txt" + - eval: + type: file + description: Quality and completeness estimates of input bin sets + pattern: "*.eval" + - bins: + type: file + description: Final refined bins in fasta format + pattern: "*.fa" + - pdfs: + type: file + description: Plots showing the amount of high quality bins and score distribution of bins per method + pattern: "*.pdf" + - fasta_proteins: + type: file + description: Output from prodigal if not already supplied + pattern: "*.proteins.faa" + - fasta_archaea_scg: + type: file + description: Results of archaeal single-copy-gene prediction + pattern: "*.archaea.scg" + - fasta_bacteria_scg: + type: file + description: Results of bacterial single-copy-gene prediction + pattern: "*.bacteria.scg" + - b6: + type: file + description: Results in b6 format + pattern: "*.b6" + - seqlength: + type: file + description: Summary of contig lengths + pattern: "*.seqlength" + +authors: + - "@maxibor" + - "@jfy133" diff --git a/modules/nf-core/dastool/fastatocontig2bin/main.nf b/modules/nf-core/dastool/fastatocontig2bin/main.nf new file mode 100644 index 00000000..f4f77c0f --- /dev/null +++ b/modules/nf-core/dastool/fastatocontig2bin/main.nf @@ -0,0 +1,41 @@ +process DASTOOL_FASTATOCONTIG2BIN { + tag "$meta.id" + label 'process_single' + + conda "bioconda::das_tool=1.1.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/das_tool:1.1.6--r42hdfd78af_0' : + 'biocontainers/das_tool:1.1.6--r42hdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + val(extension) + + output: + tuple val(meta), path("*.tsv"), emit: fastatocontig2bin + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def file_extension = extension ? extension : "fasta" + def clean_fasta = fasta.toString() - ".gz" + def decompress_fasta = fasta.toString() == clean_fasta ? "" : "gunzip -q -f $fasta" + """ + $decompress_fasta + + Fasta_to_Contig2Bin.sh \\ + $args \\ + -i . \\ + -e $file_extension \\ + > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dastool: \$( DAS_Tool --version 2>&1 | grep "DAS Tool" | sed 's/DAS Tool //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/dastool/fastatocontig2bin/meta.yml b/modules/nf-core/dastool/fastatocontig2bin/meta.yml new file mode 100644 index 00000000..1176ae96 --- /dev/null +++ b/modules/nf-core/dastool/fastatocontig2bin/meta.yml @@ -0,0 +1,56 @@ +name: dastool_fastatocontig2bin +description: Helper script to convert a set of bins in fasta format to tabular scaffolds2bin format +keywords: + - binning + - das tool + - table + - de novo + - bins + - contigs + - assembly + - das_tool +tools: + - dastool: + description: | + DAS Tool is an automated method that integrates the results + of a flexible number of binning algorithms to calculate an optimized, non-redundant + set of bins from a single assembly. 
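
Illustrative wiring sketch (not part of this diff): the two DAS Tool modules added above are normally chained so that each binner's FASTA bins are first converted into a contig2bin table and the tables are then refined together. Channel names such as ch_contigs, ch_metabat2_bins and ch_maxbin2_bins, and the include paths, are hypothetical placeholders; the calls are assumed to sit inside a workflow body.

    include { DASTOOL_FASTATOCONTIG2BIN as DASTOOL_FASTATOCONTIG2BIN_METABAT2 } from '../modules/nf-core/dastool/fastatocontig2bin/main'
    include { DASTOOL_FASTATOCONTIG2BIN as DASTOOL_FASTATOCONTIG2BIN_MAXBIN2  } from '../modules/nf-core/dastool/fastatocontig2bin/main'
    include { DASTOOL_DASTOOL                                                 } from '../modules/nf-core/dastool/dastool/main'

    // convert each binner's bin FASTAs into a contig2bin table (file extension differs per binner)
    DASTOOL_FASTATOCONTIG2BIN_METABAT2 ( ch_metabat2_bins, 'fa' )
    DASTOOL_FASTATOCONTIG2BIN_MAXBIN2  ( ch_maxbin2_bins , 'fasta' )

    // gather all contig2bin tables per sample, then refine them together with DAS Tool;
    // proteins and db_directory are optional inputs and left empty here
    ch_contig2bin = DASTOOL_FASTATOCONTIG2BIN_METABAT2.out.fastatocontig2bin
        .mix ( DASTOOL_FASTATOCONTIG2BIN_MAXBIN2.out.fastatocontig2bin )
        .groupTuple ( by: 0 )

    DASTOOL_DASTOOL ( ch_contigs.join ( ch_contig2bin ), [], [] )
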
+ + homepage: https://github.com/cmks/DAS_Tool + documentation: https://github.com/cmks/DAS_Tool + tool_dev_url: https://github.com/cmks/DAS_Tool + doi: "10.1038/s41564-018-0171-1" + licence: ["BSD"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Fasta of list of fasta files recommended to be gathered via with .collect() of bins + pattern: "*.{fa,fa.gz,fas,fas.gz,fna,fna.gz,fasta,fasta.gz}" + - extension: + type: val + description: Fasta file extension (fa | fas | fasta | ...), without .gz suffix, if gzipped input. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastatocontig2bin: + type: file + description: tabular contig2bin file for DAS tool input + pattern: "*.tsv" + +authors: + - "@maxibor" + - "@jfy133" diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf new file mode 100644 index 00000000..831b7f12 --- /dev/null +++ b/modules/nf-core/fastp/main.nf @@ -0,0 +1,102 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::fastp=0.23.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : + 'biocontainers/fastp:0.23.4--h5f740d0_0' }" + + input: + tuple val(meta), path(reads) + path adapter_fasta + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : "" + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + // Added soft-links to original fastqs for consistent naming in MultiQC + // Use single ended for interleaved. Add --interleaved_in in config. + if ( task.ext.args?.contains('--interleaved_in') ) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log \\ + | gzip -c > ${prefix}.fastp.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else if (meta.single_end) { + """ + [ ! 
-f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.fastp.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq.gz \\ + --out2 ${prefix}_2.fastp.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml new file mode 100644 index 00000000..197ea7ca --- /dev/null +++ b/modules/nf-core/fastp/meta.yml @@ -0,0 +1,73 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: 10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. If you wish to run interleaved paired-end data, supply as single-end data + but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module. + - adapter_fasta: + type: file + description: File in FASTA format containing possible adapters to remove. + pattern: "*.{fasta,fna,fas,fa}" + - save_trimmed_fail: + type: boolean + description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` + - save_merged: + type: boolean + description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified/unmerged fastq reads + pattern: "*fastp.fastq.gz" + - json: + type: file + description: Results in JSON format + pattern: "*.json" + - html: + type: file + description: Results in HTML format + pattern: "*.html" + - log: + type: file + description: fastq log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads_fail: + type: file + description: Reads the failed the preprocessing + pattern: "*fail.fastq.gz" + - reads_merged: + type: file + description: Reads that were successfully merged + pattern: "*.{merged.fastq.gz}" +authors: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/filtlong/environment.yml b/modules/nf-core/filtlong/environment.yml new file mode 100644 index 00000000..746c83a4 --- /dev/null +++ b/modules/nf-core/filtlong/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::filtlong=0.2.1 diff --git a/modules/nf-core/filtlong/main.nf b/modules/nf-core/filtlong/main.nf new file mode 100644 index 00000000..627247fe --- /dev/null +++ b/modules/nf-core/filtlong/main.nf @@ -0,0 +1,39 @@ +process FILTLONG { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/filtlong:0.2.1--h9a82719_0' : + 'biocontainers/filtlong:0.2.1--h9a82719_0' }" + + input: + tuple val(meta), path(shortreads), path(longreads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def short_reads = !shortreads ? "" : meta.single_end ? "-1 $shortreads" : "-1 ${shortreads[0]} -2 ${shortreads[1]}" + if ("$longreads" == "${prefix}.fastq.gz") error "Longread FASTQ input and output names are the same, set prefix in module configuration to disambiguate!" + """ + filtlong \\ + $short_reads \\ + $args \\ + $longreads \\ + 2> >(tee ${prefix}.log >&2) \\ + | gzip -n > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + filtlong: \$( filtlong --version | sed -e "s/Filtlong v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/filtlong/meta.yml b/modules/nf-core/filtlong/meta.yml new file mode 100644 index 00000000..804c1b0d --- /dev/null +++ b/modules/nf-core/filtlong/meta.yml @@ -0,0 +1,65 @@ +name: filtlong +description: Filtlong filters long reads based on quality measures or short read data. +keywords: + - nanopore + - quality control + - QC + - filtering + - long reads + - short reads +tools: + - filtlong: + description: Filtlong is a tool for filtering long reads. It can take a set of + long reads and produce a smaller, better subset. It uses both read length (longer + is better) and read identity (higher is better) when choosing which reads pass + the filter. + homepage: https://anaconda.org/bioconda/filtlong + tool_dev_url: https://github.com/rrwick/Filtlong + licence: ["GPL v3"] + identifier: biotools:filtlong +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - shortreads: + type: file + description: fastq file + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" + - longreads: + type: file + description: fastq file + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" +output: + - reads: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.fastq.gz": + type: file + description: Filtered (compressed) fastq file + pattern: "*.fastq.gz" + - log: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.log": + type: file + description: Standard error logging file containing summary statistics + pattern: "*.log" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@d4straub" + - "@sofstam" +maintainers: + - "@d4straub" + - "@sofstam" diff --git a/modules/nf-core/filtlong/tests/main.nf.test b/modules/nf-core/filtlong/tests/main.nf.test new file mode 100644 index 00000000..d54ce39c --- /dev/null +++ b/modules/nf-core/filtlong/tests/main.nf.test @@ -0,0 +1,108 @@ +nextflow_process { + + name "Test Process FILTLONG" + script "../main.nf" + process "FILTLONG" + config "./nextflow.config" + tag "filtlong" + tag "modules" + tag "modules_nfcore" + + test("sarscov2 nanopore [fastq]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.log.get(0).get(1)).readLines().contains("Scoring long reads")}, + { assert snapshot( + process.out.reads, + process.out.versions + ).match() + } + ) + } + + } + + + test("sarscov2 nanopore [fastq] + Illumina single-end [fastq]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.log.get(0).get(1)).readLines().contains("Scoring long reads")}, + { assert snapshot( + process.out.reads, + process.out.versions + ).match() + } + ) + } + + } + + + test("sarscov2 nanopore [fastq] + Illumina paired-end [fastq]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.log.get(0).get(1)).readLines().contains("Scoring long reads")}, + { assert snapshot( + process.out.reads, + process.out.versions + ).match() + } + ) + } + + } +} diff --git a/modules/nf-core/filtlong/tests/main.nf.test.snap b/modules/nf-core/filtlong/tests/main.nf.test.snap new file mode 100644 index 
00000000..1a25c3fc --- /dev/null +++ b/modules/nf-core/filtlong/tests/main.nf.test.snap @@ -0,0 +1,65 @@ +{ + "sarscov2 nanopore [fastq]": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test_lr.fastq.gz:md5,7567d853ada6ac142332619d0b541d76" + ] + ], + [ + "versions.yml:md5,af5988f30157282acdb0ac50ebb4c8cc" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-06T10:51:29.197603" + }, + "sarscov2 nanopore [fastq] + Illumina paired-end [fastq]": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test_lr.fastq.gz:md5,7567d853ada6ac142332619d0b541d76" + ] + ], + [ + "versions.yml:md5,af5988f30157282acdb0ac50ebb4c8cc" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-06T10:51:39.68464" + }, + "sarscov2 nanopore [fastq] + Illumina single-end [fastq]": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test_lr.fastq.gz:md5,7567d853ada6ac142332619d0b541d76" + ] + ], + [ + "versions.yml:md5,af5988f30157282acdb0ac50ebb4c8cc" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.3" + }, + "timestamp": "2024-08-06T10:51:34.404022" + } +} \ No newline at end of file diff --git a/modules/nf-core/filtlong/tests/nextflow.config b/modules/nf-core/filtlong/tests/nextflow.config new file mode 100644 index 00000000..d366b4c3 --- /dev/null +++ b/modules/nf-core/filtlong/tests/nextflow.config @@ -0,0 +1,4 @@ +process { + ext.args = "--min_length 10" + ext.prefix = "test_lr" +} diff --git a/modules/nf-core/freebayes/main.nf b/modules/nf-core/freebayes/main.nf new file mode 100644 index 00000000..1466f085 --- /dev/null +++ b/modules/nf-core/freebayes/main.nf @@ -0,0 +1,51 @@ +process FREEBAYES { + tag "$meta.id" + label 'process_single' + + conda "bioconda::freebayes=1.3.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/freebayes:1.3.6--hbfe0e7f_2' : + 'biocontainers/freebayes:1.3.6--hbfe0e7f_2' }" + + input: + tuple val(meta), path(input_1), path(input_1_index), path(input_2), path(input_2_index), path(target_bed) + path fasta + path fasta_fai + path samples + path populations + path cnv + + output: + tuple val(meta), path("*.vcf.gz"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = input_2 ? "${input_1} ${input_2}" : "${input_1}" + def targets_file = target_bed ? "--target ${target_bed}" : "" + def samples_file = samples ? "--samples ${samples}" : "" + def populations_file = populations ? "--populations ${populations}" : "" + def cnv_file = cnv ? 
"--cnv-map ${cnv}" : "" + + """ + freebayes \\ + -f $fasta \\ + $targets_file \\ + $samples_file \\ + $populations_file \\ + $cnv_file \\ + $args \\ + $input > ${prefix}.vcf + + bgzip ${prefix}.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + freebayes: \$(echo \$(freebayes --version 2>&1) | sed 's/version:\s*v//g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/freebayes/meta.yml b/modules/nf-core/freebayes/meta.yml new file mode 100644 index 00000000..17d83cba --- /dev/null +++ b/modules/nf-core/freebayes/meta.yml @@ -0,0 +1,82 @@ +name: freebayes +description: A haplotype-based variant detector +keywords: + - variant caller + - SNP + - genotyping + - somatic variant calling + - germline variant calling + - bacterial variant calling + - bayesian + +tools: + - freebayes: + description: Bayesian haplotype-based polymorphism discovery and genotyping + homepage: https://github.com/freebayes/freebayes + documentation: https://github.com/freebayes/freebayes + tool_dev_url: https://github.com/freebayes/freebayes + doi: "10.48550/arXiv.1207.3907" + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - input_index: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai}" + - target_bed: + type: file + description: Optional - Limit analysis to targets listed in this BED-format FILE. + pattern: "*.bed" + - fasta: + type: file + description: reference fasta file + pattern: ".{fa,fa.gz,fasta,fasta.gz}" + - fasta_fai: + type: file + description: reference fasta file index + pattern: "*.{fa,fasta}.fai" + - samples: + type: file + description: Optional - Limit analysis to samples listed (one per line) in the FILE. + pattern: "*.txt" + - populations: + type: file + description: Optional - Each line of FILE should list a sample and a population which it is part of. + pattern: "*.txt" + - cnv: + type: file + description: | + A copy number map BED file, which has either a sample-level ploidy: + sample_name copy_number + or a region-specific format: + seq_name start end sample_name copy_number + pattern: "*.bed" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" + - vcf: + type: file + description: Compressed VCF file + pattern: "*.vcf.gz" + +authors: + - "@maxibor" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/genomad/download/main.nf b/modules/nf-core/genomad/download/main.nf new file mode 100644 index 00000000..a2ac6ecb --- /dev/null +++ b/modules/nf-core/genomad/download/main.nf @@ -0,0 +1,72 @@ +process GENOMAD_DOWNLOAD { + label 'process_single' + + conda "bioconda::genomad=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/genomad:1.5.2--pyhdfd78af_0': + 'biocontainers/genomad:1.5.2--pyhdfd78af_0' }" + + output: + path "genomad_db/" , emit: genomad_db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + genomad \\ + download-database . 
+ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genomad: \$(echo \$(genomad --version 2>&1) | sed 's/^.*geNomad, version //; s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + mkdir genomad_db + touch genomad_db/genomad_db + touch genomad_db/genomad_db.dbtype + touch genomad_db/genomad_db.index + touch genomad_db/genomad_db.lookup + touch genomad_db/genomad_db.source + touch genomad_db/genomad_db_h + touch genomad_db/genomad_db_h.dbtype + touch genomad_db/genomad_db_h.index + touch genomad_db/genomad_db_mapping + touch genomad_db/genomad_db_taxonomy + touch genomad_db/genomad_integrase_db + touch genomad_db/genomad_integrase_db.dbtype + touch genomad_db/genomad_integrase_db.index + touch genomad_db/genomad_integrase_db.lookup + touch genomad_db/genomad_integrase_db.source + touch genomad_db/genomad_integrase_db_h + touch genomad_db/genomad_integrase_db_h.dbtype + touch genomad_db/genomad_integrase_db_h.index + touch genomad_db/genomad_marker_metadata.tsv + touch genomad_db/genomad_mini_db + touch genomad_db/genomad_mini_db.dbtype + touch genomad_db/genomad_mini_db.index + touch genomad_db/genomad_mini_db.lookup + touch genomad_db/genomad_mini_db.source + touch genomad_db/genomad_mini_db_h + touch genomad_db/genomad_mini_db_h.dbtype + touch genomad_db/genomad_mini_db_h.index + touch genomad_db/genomad_mini_db_mapping + touch genomad_db/genomad_mini_db_taxonomy + touch genomad_db/mini_set_ids + touch genomad_db/names.dmp + touch genomad_db/nodes.dmp + touch genomad_db/plasmid_hallmark_annotation.txt + touch genomad_db/version.txt + touch genomad_db/virus_hallmark_annotation.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genomad: \$(echo \$(genomad --version 2>&1) | sed 's/^.*geNomad, version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/genomad/download/meta.yml b/modules/nf-core/genomad/download/meta.yml new file mode 100644 index 00000000..dee0428c --- /dev/null +++ b/modules/nf-core/genomad/download/meta.yml @@ -0,0 +1,31 @@ +name: "genomad_download" +description: Download geNomad databases and related files +keywords: + - metagenomics + - genomad + - database + - download + - phage + - virus + - plasmid +tools: + - "genomad": + description: "Identification of mobile genetic elements" + homepage: https://portal.nersc.gov/genomad/ + documentation: https://portal.nersc.gov/genomad/ + tool_dev_url: https://github.com/apcamargo/genomad/ + doi: 10.1101/2023.03.05.531206 + licence: "['Lawrence Berkeley National Labs BSD variant license']" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - genomad_db: + type: directory + description: Directory containing downloaded data with directory being named "genomad_db" + pattern: "genomad_db" + +authors: + - "@CarsonJM" diff --git a/modules/nf-core/genomad/endtoend/main.nf b/modules/nf-core/genomad/endtoend/main.nf new file mode 100644 index 00000000..48276578 --- /dev/null +++ b/modules/nf-core/genomad/endtoend/main.nf @@ -0,0 +1,82 @@ +process GENOMAD_ENDTOEND { + tag "$meta.id" + label 'process_high' + + conda "bioconda::genomad=1.5.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/genomad:1.5.2--pyhdfd78af_0': + 'biocontainers/genomad:1.5.2--pyhdfd78af_0' }" + + input: + tuple val(meta) , path(fasta) + path genomad_db + + output: + tuple val(meta), path("*_aggregated_classification/*_aggregated_classification.tsv") , emit: aggregated_classification + tuple val(meta), path("*_annotate/*_taxonomy.tsv") , emit: taxonomy + tuple val(meta), path("*_find_proviruses/*_provirus.tsv") , emit: provirus + tuple val(meta), path("*_score_calibration/*_compositions.tsv") , emit: compositions , optional: true + tuple val(meta), path("*_score_calibration/*_calibrated_aggregated_classification.tsv") , emit: calibrated_classification , optional: true + tuple val(meta), path("*_summary/*_plasmid.fna") , emit: plasmid_fasta + tuple val(meta), path("*_summary/*_plasmid_genes.tsv") , emit: plasmid_genes + tuple val(meta), path("*_summary/*_plasmid_proteins.faa") , emit: plasmid_proteins + tuple val(meta), path("*_summary/*_plasmid_summary.tsv") , emit: plasmid_summary + tuple val(meta), path("*_summary/*_virus.fna") , emit: virus_fasta + tuple val(meta), path("*_summary/*_virus_genes.tsv") , emit: virus_genes + tuple val(meta), path("*_summary/*_virus_proteins.faa") , emit: virus_proteins + tuple val(meta), path("*_summary/*_virus_summary.tsv") , emit: virus_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + genomad \\ + end-to-end \\ + $fasta \\ + ./ \\ + $genomad_db \\ + --threads $task.cpus \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genomad: \$(echo \$(genomad --version 2>&1) | sed 's/^.*geNomad, version //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def filename = "${fasta}"[0..<"${fasta}".lastIndexOf('.')] + """ + mkdir ${filename}_aggregated_classification + touch ${filename}_aggregated_classification/${filename}_aggregated_classification.tsv + mkdir ${filename}_annotate + touch ${filename}_annotate/${filename}_taxonomy.tsv + mkdir ${filename}_find_proviruses + touch ${filename}_find_proviruses/${filename}_provirus.tsv + mkdir ${filename}_marker_classification + mkdir ${filename}_nn_classification + mkdir ${filename}_score_calibration + touch ${filename}_score_calibration/${filename}_calibrated_aggregated_classification.tsv + touch ${filename}_score_calibration/${filename}_compositions.tsv + mkdir ${filename}_summary + touch ${filename}_summary/${filename}_plasmid.fna + touch ${filename}_summary/${filename}_plasmid_genes.tsv + touch ${filename}_summary/${filename}_plasmid_proteins.faa + touch ${filename}_summary/${filename}_plasmid_summary.tsv + touch ${filename}_summary/${filename}_virus.fna + touch ${filename}_summary/${filename}_virus_genes.tsv + touch ${filename}_summary/${filename}_virus_proteins.faa + touch ${filename}_summary/${filename}_virus_summary.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genomad: \$(echo \$(genomad --version 2>&1) | sed 's/^.*geNomad, version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/genomad/endtoend/meta.yml b/modules/nf-core/genomad/endtoend/meta.yml new file mode 100644 index 00000000..b5a6f61f --- /dev/null +++ b/modules/nf-core/genomad/endtoend/meta.yml @@ -0,0 +1,103 @@ +name: "genomad_endtoend" + +description: Identify mobile genetic elements present in genomic assemblies +keywords: + - 
metagenomics + - genomad + - database + - download + - phage + - virus + - plasmid + +tools: + - "genomad": + description: "Identification of mobile genetic elements" + homepage: https://portal.nersc.gov/genomad/ + documentation: https://portal.nersc.gov/genomad/ + tool_dev_url: https://github.com/apcamargo/genomad/ + doi: 10.1101/2023.03.05.531206 + licence: "['Lawrence Berkeley National Labs BSD variant license']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file containing contigs/scaffolds/chromosomes + pattern: "*.{fasta,fna,fa}" + - genomad_db: + type: directory + description: Directory pointing to geNomad database + - score_calibration: + type: boolean + description: true/false value to indicate if score calibration should be enabled + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - aggregated_classification: + type: file + description: Combined classification scores for each contig/scaffold/chromosome + pattern: "*_aggregated_classification.tsv" + - taxonomy: + type: file + description: Detailed output of geNomad's marker gene taxonomy analysis + pattern: "*_taxonomy.tsv" + - provirus: + type: file + description: Detailed output of each provirus identified by geNomad's find_proviruses module + pattern: "*_provirus.tsv" + - compositions: + type: file + description: OPTIONAL - Predicted sample composition when `--enable-score-calibration` is used + pattern: "*_compositions.tsv" + - calibrated_classification: + type: file + description: OPTIONAL - Classification scores that have been adjusted based on sample composition when `--enable-score-calibration` is used` + pattern: "*_calibrated_aggregated_classification.tsv" + - plasmid_fasta: + type: file + description: FASTA file containing predicted plasmid sequences + pattern: "*_plasmid.fna" + - plasmid_genes: + type: file + description: TSV file containing predicted plasmid genes and their annotations + pattern: "*_plasmid_genes.tsv" + - plasmid_proteins: + type: file + description: FASTA file containing predicted plasmid protein sequences + pattern: "*_plasmid_proteins.faa" + - plasmid_summary: + type: file + description: TSV file containing a summary of geNomad's plasmid predictions + pattern: "*_plasmid_summary.tsv" + - virus_fasta: + type: file + description: FASTA file containing predicted virus sequences + pattern: "*_virus.fna" + - virus_genes: + type: file + description: TSV file containing predicted virus genes and their annotations + pattern: "*_virus_genes.tsv" + - virus_proteins: + type: file + description: FASTA file containing predicted virus protein sequences + pattern: "*_virus_proteins.faa" + - virus_summary: + type: file + description: TSV file containing a summary of geNomad's virus predictions + pattern: "*_virus_summary.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@CarsonJM" diff --git a/modules/nf-core/gtdbtk/classifywf/environment.yml b/modules/nf-core/gtdbtk/classifywf/environment.yml new file mode 100644 index 00000000..500531ea --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::gtdbtk=2.4.0 diff --git a/modules/nf-core/gtdbtk/classifywf/main.nf b/modules/nf-core/gtdbtk/classifywf/main.nf new file mode 100644 index 
00000000..23862fee --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/main.nf @@ -0,0 +1,93 @@ +process GTDBTK_CLASSIFYWF { + tag "${prefix}" + label 'process_medium' + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gtdbtk:2.4.0--pyhdfd78af_1' : 'biocontainers/gtdbtk:2.4.0--pyhdfd78af_1'}" + + input: + tuple val(meta) , path("bins/*") + tuple val(db_name), path("database/*") + val use_pplacer_scratch_dir + path mash_db + + output: + tuple val(meta), path("gtdbtk.${prefix}.*.summary.tsv") , emit: summary + tuple val(meta), path("gtdbtk.${prefix}.*.classify.tree.gz") , emit: tree , optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.markers_summary.tsv"), emit: markers , optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.msa.fasta.gz") , emit: msa , optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.user_msa.fasta.gz") , emit: user_msa, optional: true + tuple val(meta), path("gtdbtk.${prefix}.*.filtered.tsv") , emit: filtered, optional: true + tuple val(meta), path("gtdbtk.${prefix}.failed_genomes.tsv") , emit: failed , optional: true + tuple val(meta), path("gtdbtk.${prefix}.log") , emit: log + tuple val(meta), path("gtdbtk.${prefix}.warnings.log") , emit: warnings + path ("versions.yml"), emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def pplacer_scratch = use_pplacer_scratch_dir ? "--scratch_dir pplacer_tmp" : "" + def mash_mode = mash_db ? "--mash_db ${mash_db}" : "--skip_ani_screen" + prefix = task.ext.prefix ?: "${meta.id}" + + """ + export GTDBTK_DATA_PATH="\${PWD}/database" + if [ ${pplacer_scratch} != "" ] ; then + mkdir pplacer_tmp + fi + + gtdbtk classify_wf \\ + ${args} \\ + --genome_dir bins \\ + --prefix "gtdbtk.${prefix}" \\ + --out_dir "\${PWD}" \\ + --cpus ${task.cpus} \\ + ${mash_mode} \\ + ${pplacer_scratch} + + ## If mash db given, classify/ and identify/ directories won't be created + if [[ -d classify/ && \$(ls -A classify/) ]]; then + mv classify/* . + fi + + if [[ -d identify/ && \$(ls -A identify/) ]]; then + mv identify/* . + fi + + ## If nothing aligns, no output, so only run + if [[ -d align/ && \$(ls -A align/) ]]; then + mv align/* . 
+ fi + + mv gtdbtk.log "gtdbtk.${prefix}.log" + + mv gtdbtk.warnings.log "gtdbtk.${prefix}.warnings.log" + + find -name "gtdbtk.${prefix}.*.classify.tree" | xargs -r gzip # do not fail if .tree is missing + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gtdbtk: \$(echo \$(gtdbtk --version -v 2>&1) | sed "s/gtdbtk: version //; s/ Copyright.*//") + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch gtdbtk.${prefix}.stub.summary.tsv + echo "" | gzip > gtdbtk.${prefix}.stub.classify.tree.gz + touch gtdbtk.${prefix}.stub.markers_summary.tsv + echo "" | gzip > gtdbtk.${prefix}.stub.msa.fasta.gz + echo "" | gzip > gtdbtk.${prefix}.stub.user_msa.fasta.gz + touch gtdbtk.${prefix}.stub.filtered.tsv + touch gtdbtk.${prefix}.log + touch gtdbtk.${prefix}.warnings.log + touch gtdbtk.${prefix}.failed_genomes.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gtdbtk: \$(echo \$(gtdbtk --version -v 2>&1) | sed "s/gtdbtk: version //; s/ Copyright.*//") + END_VERSIONS + """ +} diff --git a/modules/nf-core/gtdbtk/classifywf/meta.yml b/modules/nf-core/gtdbtk/classifywf/meta.yml new file mode 100644 index 00000000..0667dcd6 --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/meta.yml @@ -0,0 +1,91 @@ +name: gtdbtk_classifywf +description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. +keywords: + - GTDB taxonomy + - taxonomic classification + - metagenomics + - classification + - genome taxonomy database + - bacteria + - archaea +tools: + - gtdbtk: + description: GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy GTDB. + homepage: https://ecogenomics.github.io/GTDBTk/ + documentation: https://ecogenomics.github.io/GTDBTk/ + tool_dev_url: https://github.com/Ecogenomics/GTDBTk + doi: "10.1093/bioinformatics/btz848" + licence: ["GNU General Public v3 (GPL v3)"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, assembler:'spades' ] + - bins: + type: file + description: The binned fasta files from the assembler + pattern: "*.{fasta,fa}" + - database: + type: file + description: The local copy of the taxonomic database used by GTDB-tk (unzipped copy) + pattern: "*" + - use_pplacer_scratch_dir: + type: boolean + description: Set to true to reduce pplacer memory usage by writing to disk (slower) + - mash_db: + type: file + description: The local copy of the Mash sketch database used by GTDB-tk if `ani_screen` mode is used (optional) + pattern: "*.msh" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - summary: + type: file + description: A TSV summary file for the classification + pattern: "*.{summary.tsv}" + - tree: + type: file + description: NJ or UPGMA tree in Newick format produced from a multiple sequence alignment + pattern: "*.{classify.tree.gz}" + - markers: + type: file + description: A TSV summary file lineage markers used for the classification. + pattern: "*.{markers_summary.tsv}" + - msa: + type: file + description: Multiple sequence alignments file. 
+ pattern: "*.{msa.fasta.gz}" + - user_msa: + type: file + description: Multiple sequence alignments file for the user-provided files. + pattern: "*.{user_msa.fasta.gz}" + - filtered: + type: file + description: A list of genomes with an insufficient number of amino acids in MSA.. + pattern: "*.{filtered.tsv}" + - log: + type: file + description: GTDB-tk log file + pattern: "*.{log}" + - warnings: + type: file + description: GTDB-tk warnings log file + pattern: "*.{warnings.log}" + - failed: + type: file + description: A TSV summary of the genomes which GTDB-tk failed to classify. + pattern: "*.{failed_genomes.tsv}" +authors: + - "@skrakau" + - "@abhi18av" +maintainers: + - "@skrakau" + - "@abhi18av" diff --git a/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test new file mode 100644 index 00000000..deca962d --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test @@ -0,0 +1,42 @@ +nextflow_process { + + name "Test Process GTDBTK_CLASSIFYWF" + script "../main.nf" + process "GTDBTK_CLASSIFYWF" + + tag "modules" + tag "modules_nfcore" + tag "gtdbtk" + tag "gtdbtk/classifywf" + + // Only stub test is possible due to very large required database (>70GB) + test("sarscov2 - genome fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false, assembler:'SPADES' ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/scaffolds.fasta', checkIfExists: true), + ] + ] + input[1] = [[], []] + input[2] = false + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap new file mode 100644 index 00000000..eb0ee89a --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/tests/main.nf.test.snap @@ -0,0 +1,199 @@ +{ + "sarscov2 - genome fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.classify.tree.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.failed_genomes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + 
"gtdbtk.test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "8": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "9": [ + "versions.yml:md5,2c94de2b8633b99e11881ab0193835d7" + ], + "failed": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.failed_genomes.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "filtered": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.filtered.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "markers": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.markers_summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "msa": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "summary": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.summary.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tree": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.classify.tree.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "user_msa": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.stub.user_msa.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,2c94de2b8633b99e11881ab0193835d7" + ], + "warnings": [ + [ + { + "id": "test", + "single_end": false, + "assembler": "SPADES" + }, + "gtdbtk.test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-16T11:46:32.337929018" + } +} \ No newline at end of file diff --git a/modules/nf-core/gtdbtk/classifywf/tests/tags.yml b/modules/nf-core/gtdbtk/classifywf/tests/tags.yml new file mode 100644 index 00000000..5d8badac --- /dev/null +++ b/modules/nf-core/gtdbtk/classifywf/tests/tags.yml @@ -0,0 +1,2 @@ +gtdbtk/classifywf: + - "modules/nf-core/gtdbtk/classifywf/**" diff --git a/modules/nf-core/gunc/downloaddb/main.nf b/modules/nf-core/gunc/downloaddb/main.nf new file mode 100644 index 00000000..a080d8f2 --- /dev/null +++ b/modules/nf-core/gunc/downloaddb/main.nf @@ -0,0 +1,30 @@ +process GUNC_DOWNLOADDB { + tag "$db_name" + label 'process_single' + + conda "bioconda::gunc=1.0.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gunc:1.0.5--pyhdfd78af_0' : + 'biocontainers/gunc:1.0.5--pyhdfd78af_0' }" + + input: + val db_name + + output: + path "*.dmnd" , emit: db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + gunc download_db . 
-db $db_name $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunc/downloaddb/meta.yml b/modules/nf-core/gunc/downloaddb/meta.yml new file mode 100644 index 00000000..c36ff3f3 --- /dev/null +++ b/modules/nf-core/gunc/downloaddb/meta.yml @@ -0,0 +1,36 @@ +name: gunc_downloaddb +description: Download database for GUNC detection of Chimerism and Contamination in Prokaryotic Genomes +keywords: + - download + - prokaryote + - assembly + - genome + - quality control + - chimeras +tools: + - gunc: + description: Python package for detection of chimerism and contamination in prokaryotic genomes. + homepage: https://grp-bork.embl-community.io/gunc/ + documentation: https://grp-bork.embl-community.io/gunc/ + tool_dev_url: https://github.com/grp-bork/gunc + doi: "10.1186/s13059-021-02393-0" + licence: ["GNU General Public v3 or later (GPL v3+)"] + +input: + - db_name: + type: string + description: "Which database to download. Options: progenomes or gtdb" + pattern: "progenomes|gtdb" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - db: + type: file + description: GUNC database file + pattern: "*.dmnd" + +authors: + - "@jfy133" diff --git a/modules/nf-core/gunc/mergecheckm/environment.yml b/modules/nf-core/gunc/mergecheckm/environment.yml new file mode 100644 index 00000000..3a0264f4 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::gunc=1.0.6 diff --git a/modules/nf-core/gunc/mergecheckm/main.nf b/modules/nf-core/gunc/mergecheckm/main.nf new file mode 100644 index 00000000..611f916c --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/main.nf @@ -0,0 +1,46 @@ +process GUNC_MERGECHECKM { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gunc:1.0.6--pyhdfd78af_0' : + 'biocontainers/gunc:1.0.6--pyhdfd78af_0' }" + + input: + tuple val(meta), path(gunc_file), path(checkm_file) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gunc \\ + merge_checkm \\ + $args \\ + -g $gunc_file \\ + -c $checkm_file \\ + -o . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ + + stub: + """ + touch gunc_merge_checkm.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunc/mergecheckm/meta.yml b/modules/nf-core/gunc/mergecheckm/meta.yml new file mode 100644 index 00000000..4a7a2c1c --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/meta.yml @@ -0,0 +1,55 @@ +name: "gunc_mergecheckm" +description: Merging of CheckM and GUNC results in one summary table +keywords: + - gunc + - checkm + - summary + - prokaryote + - assembly + - genome + - quality control + - chimeras +tools: + - gunc: + description: Python package for detection of chimerism and contamination in prokaryotic + genomes. 
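
Usage sketch for the GUNC modules above (illustrative only, not part of this diff): GUNC_DOWNLOADDB fetches the reference DIAMOND database once and its output is reused for every GUNC_RUN call, mirroring the module test setup. The include paths and the ch_bins channel of shape [ meta, [ bin fastas ] ] are assumptions.

    include { GUNC_DOWNLOADDB } from '../modules/nf-core/gunc/downloaddb/main'
    include { GUNC_RUN        } from '../modules/nf-core/gunc/run/main'

    // download the GUNC reference database once ('progenomes' or 'gtdb') ...
    GUNC_DOWNLOADDB ( 'progenomes' )

    // ... and screen each sample's bins against it
    GUNC_RUN ( ch_bins, GUNC_DOWNLOADDB.out.db )
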
+ homepage: https://grp-bork.embl-community.io/gunc/ + documentation: https://grp-bork.embl-community.io/gunc/ + tool_dev_url: https://github.com/grp-bork/gunc + doi: "10.1186/s13059-021-02393-0" + licence: ["GNU General Public v3 or later (GPL v3+)"] + identifier: biotools:gunc +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gunc_file: + type: file + description: Path of a gunc_scores.tsv file (mandatory) + pattern: "*.{bam,cram,sam}" + - checkm_file: + type: file + description: Output TSV from CheckM qa (ideally with -o 2 extended format) (mandatory) + pattern: "*.{bam,cram,sam}" +output: + - tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tsv": + type: file + description: Merged checkm/gunc results in TSV format + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/gunc/mergecheckm/tests/main.nf.test b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test new file mode 100644 index 00000000..dbd67b90 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test @@ -0,0 +1,175 @@ +nextflow_process { + + name "Test Process GUNC_MERGECHECKM" + script "../main.nf" + process "GUNC_MERGECHECKM" + config "./nextflow.config" + + tag "modules_nfcore" + tag "modules" + tag "gunc" + tag "gunc/mergecheckm" + tag "gunc/run" + tag "gunc/downloaddb" + tag "checkm/lineagewf" + tag "checkm/qa" + + // commented out because GitHub runners are not able to run this test + // test("gunc - mergecheckm") { + + // setup { + // run("CHECKM_LINEAGEWF") { + // script "../../../checkm/lineagewf/main.nf" + // process { + // """ + // input[0] = [ + // [id: 'test'], // meta map + // file( + // params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + // checkIfExists: true + // ) + // ] + // input[1] = 'fasta' + // input[2] = [] // Download CheckM database + // """ + // } + // } + + // run("CHECKM_QA") { + // script "../../../checkm/qa/main.nf" + // process { + // """ + // input[0] = CHECKM_LINEAGEWF.out.checkm_output + // .join(CHECKM_LINEAGEWF.out.marker_file) + // .map { sample_data -> sample_data + [file('NO_FILE')] } + // input[1] = [] + // """ + // } + // } + + // run("GUNC_DOWNLOADDB") { + // script "../../downloaddb/main.nf" + // process { + // """ + // input[0] = 'progenomes' + // """ + // } + // } + + // run("GUNC_RUN") { + // script "../../run/main.nf" + // process { + // """ + // input[0] = [ + // [id: 'test'], + // [file( + // params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + // checkIfExists: true + // )] + // ] + // input[1] = GUNC_DOWNLOADDB.out.db + // """ + // } + // } + // } + + // when { + // params { + // outdir = "${launchDir}/tests/results" + // } + // process { + // """ + // input[0] = GUNC_RUN.out.maxcss_level_tsv.join(CHECKM_QA.out.output) + // """ + // } + // } + + // then { + // assertAll( + // { assert process.success }, + // { assert snapshot(process.out).match() } + // ) + // } + + // } + + test("gunc - mergecheckm - stub") { + + options "-stub" + + setup { + run("CHECKM_LINEAGEWF") { + script "../../../checkm/lineagewf/main.nf" + process { + """ + input[0] = [ + [id: 'test'], // meta map + file( + params.modules_testdata_base_path + 
'genomics/sarscov2/illumina/fasta/contigs.fasta', + checkIfExists: true + ) + ] + input[1] = 'fasta' + input[2] = [] // Download CheckM database + """ + } + } + + run("CHECKM_QA") { + script "../../../checkm/qa/main.nf" + process { + """ + input[0] = CHECKM_LINEAGEWF.out.checkm_output + .join(CHECKM_LINEAGEWF.out.marker_file) + .map { v -> v + [file('NO_FILE')] } + input[1] = [] + """ + } + } + + run("GUNC_DOWNLOADDB") { + script "../../downloaddb/main.nf" + process { + """ + input[0] = 'progenomes' + """ + } + } + + run("GUNC_RUN") { + script "../../run/main.nf" + process { + """ + input[0] = [ + [id: 'test'], + [file( + params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + checkIfExists: true + )] + ] + input[1] = GUNC_DOWNLOADDB.out.db + """ + } + } + } + + when { + params { + outdir = "${launchDir}/tests/results" + } + process { + """ + input[0] = GUNC_RUN.out.maxcss_level_tsv.join(CHECKM_QA.out.output) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} \ No newline at end of file diff --git a/modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap new file mode 100644 index 00000000..807c23f2 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "gunc - mergecheckm": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "GUNC_checkM.merged.tsv:md5,24cbd3c76a36cb90ac993c83525a2c1b" + ] + ], + "1": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ], + "tsv": [ + [ + { + "id": "test" + }, + "GUNC_checkM.merged.tsv:md5,24cbd3c76a36cb90ac993c83525a2c1b" + ] + ], + "versions": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-22T09:37:48.146410153" + }, + "gunc - mergecheckm - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "gunc_merge_checkm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ], + "tsv": [ + [ + { + "id": "test" + }, + "gunc_merge_checkm.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,a94747201129170b1cfbce5e59de62b0" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-21T16:47:06.752273424" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunc/mergecheckm/tests/nextflow.config b/modules/nf-core/gunc/mergecheckm/tests/nextflow.config new file mode 100644 index 00000000..1e9ba3dc --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: CHECKM_QA { + ext.args = '--tab_table' + } +} diff --git a/modules/nf-core/gunc/mergecheckm/tests/tags.yml b/modules/nf-core/gunc/mergecheckm/tests/tags.yml new file mode 100644 index 00000000..d05282f2 --- /dev/null +++ b/modules/nf-core/gunc/mergecheckm/tests/tags.yml @@ -0,0 +1,6 @@ +gunc/run: + - modules/nf-core/gunc/mergecheckm/** + - modules/nf-core/gunc/run/** + - modules/nf-core/gunc/downloaddb/** + - modules/nf-core/checkm/lineagewf/** + - modules/nf-core/checkm/qa/** diff --git a/modules/nf-core/gunc/run/environment.yml b/modules/nf-core/gunc/run/environment.yml new file mode 100644 index 00000000..3a0264f4 --- /dev/null +++ b/modules/nf-core/gunc/run/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::gunc=1.0.6 
diff --git a/modules/nf-core/gunc/run/main.nf b/modules/nf-core/gunc/run/main.nf new file mode 100644 index 00000000..9ee614e4 --- /dev/null +++ b/modules/nf-core/gunc/run/main.nf @@ -0,0 +1,49 @@ +process GUNC_RUN { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gunc:1.0.6--pyhdfd78af_0' : + 'biocontainers/gunc:1.0.6--pyhdfd78af_0' }" + + input: + tuple val(meta), path(fasta_files, stageAs: 'input_files/*') + path(db) + + output: + tuple val(meta), path("*maxCSS_level.tsv") , emit: maxcss_level_tsv + tuple val(meta), path("*all_levels.tsv") , optional: true, emit: all_levels_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + ls input_files/* > input_files.txt + gunc \\ + run \\ + --input_file input_files.txt \\ + --db_file $db \\ + --threads $task.cpus \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ + + stub: + """ + touch maxCSS_level.tsv all_levels.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunc: \$( gunc --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunc/run/meta.yml b/modules/nf-core/gunc/run/meta.yml new file mode 100644 index 00000000..3ecc0b74 --- /dev/null +++ b/modules/nf-core/gunc/run/meta.yml @@ -0,0 +1,62 @@ +name: gunc_run +description: Detection of Chimerism and Contamination in Prokaryotic Genomes +keywords: + - prokaryote + - assembly + - genome + - quality control + - chimeras +tools: + - gunc: + description: Python package for detection of chimerism and contamination in prokaryotic + genomes. + homepage: https://grp-bork.embl-community.io/gunc/ + documentation: https://grp-bork.embl-community.io/gunc/ + tool_dev_url: https://github.com/grp-bork/gunc + doi: "10.1186/s13059-021-02393-0" + licence: ["GNU General Public v3 or later (GPL v3+)"] + identifier: biotools:gunc +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta_files: + type: file + description: A list of FASTA files containing contig (bins) + pattern: "*.fa" + - - db: + type: file + description: GUNC database file + pattern: "*.dmnd" +output: + - maxcss_level_tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*maxCSS_level.tsv": + type: file + description: Output file with results for the maximum CSS level + pattern: "*.tsv" + - all_levels_tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*all_levels.tsv": + type: file + description: Optional output file with results for each taxonomic level + pattern: "*.tsv" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/gunc/run/tests/main.nf.test b/modules/nf-core/gunc/run/tests/main.nf.test new file mode 100644 index 00000000..c1659f0c --- /dev/null +++ b/modules/nf-core/gunc/run/tests/main.nf.test @@ -0,0 +1,96 @@ +nextflow_process { + + name "Test Process GUNC_RUN" + script "../main.nf" + process "GUNC_RUN" + + tag "modules_nfcore" + tag "modules" + tag "gunc" + tag "gunc/run" + tag "gunc/downloaddb" + + // commented out because GitHub runners are not able to run this test + // test("gunc - run") { + + // setup { + // run("GUNC_DOWNLOADDB") { + // script "../../downloaddb/main.nf" + // process { + // """ + // input[0] = 'progenomes' + // """ + // } + // } + // } + + // when { + // params { + // outdir = "${launchDir}/tests/results" + // } + // process { + // """ + // input[0] = [ + // [id: 'test'], + // [file( + // params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + // checkIfExists: true + // )] + // ] + // input[1] = GUNC_DOWNLOADDB.out.db + // """ + // } + // } + + // then { + // assertAll( + // { assert process.success }, + // { assert snapshot(process.out).match() } + // ) + // } + + // } + + test("gunc - run - stub") { + + options "-stub" + + setup { + run("GUNC_DOWNLOADDB") { + script "../../downloaddb/main.nf" + process { + """ + input[0] = 'progenomes' + """ + } + } + } + + when { + params { + outdir = "${launchDir}/tests/results" + } + process { + """ + input[0] = [ + [id: 'test'], + [file( + params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fasta/contigs.fasta', + checkIfExists: true + )] + ] + input[1] = GUNC_DOWNLOADDB.out.db + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/gunc/run/tests/main.nf.test.snap b/modules/nf-core/gunc/run/tests/main.nf.test.snap new file mode 100644 index 00000000..516425c8 --- /dev/null +++ b/modules/nf-core/gunc/run/tests/main.nf.test.snap @@ -0,0 +1,90 @@ +{ + "gunc - run - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "maxCSS_level.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "all_levels.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ], + "all_levels_tsv": [ + [ + { + "id": "test" + }, + "all_levels.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "maxcss_level_tsv": [ + [ + { + "id": "test" + }, + "maxCSS_level.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-21T17:29:46.904708749" + }, + "gunc - run": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "GUNC.progenomes_2.1.maxCSS_level.tsv:md5,938826458a44404d0bf2e7cb4edde405" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ], + "all_levels_tsv": [ + + ], + "maxcss_level_tsv": [ + [ + { + "id": "test" + }, + "GUNC.progenomes_2.1.maxCSS_level.tsv:md5,938826458a44404d0bf2e7cb4edde405" + ] + ], + "versions": [ + 
"versions.yml:md5,2ee4942c0187a663aed4b66af3bead6a" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.0" + }, + "timestamp": "2024-11-22T10:12:03.813571948" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunc/run/tests/tags.yml b/modules/nf-core/gunc/run/tests/tags.yml new file mode 100644 index 00000000..0af96444 --- /dev/null +++ b/modules/nf-core/gunc/run/tests/tags.yml @@ -0,0 +1,3 @@ +gunc/run: + - modules/nf-core/gunc/run/** + - modules/nf-core/gunc/downloaddb/** diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 00000000..73bf08cd --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,48 @@ +process GUNZIP { + tag "$archive" + label 'process_single' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$gunzip"), emit: gunzip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + gunzip = archive.toString() - '.gz' + """ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ + $args \\ + $archive \\ + > $gunzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + gunzip = archive.toString() - '.gz' + """ + touch $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 00000000..4cdcdf4c --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,35 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression + - decompression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. 
[ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" +output: + - gunzip: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/krakentools/kreport2krona/environment.yml b/modules/nf-core/krakentools/kreport2krona/environment.yml new file mode 100644 index 00000000..ea49a77c --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/environment.yml @@ -0,0 +1,7 @@ +name: krakentools_kreport2krona +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::krakentools=1.2 diff --git a/modules/nf-core/krakentools/kreport2krona/main.nf b/modules/nf-core/krakentools/kreport2krona/main.nf new file mode 100644 index 00000000..f9f27001 --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/main.nf @@ -0,0 +1,36 @@ +process KRAKENTOOLS_KREPORT2KRONA { + tag "$meta.id" + label 'process_single' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0': + 'biocontainers/krakentools:1.2--pyh5e36f6f_0' }" + + input: + tuple val(meta), path(kreport) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + kreport2krona.py \\ + -r ${kreport} \\ + -o ${prefix}.txt \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kreport2krona.py: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/krakentools/kreport2krona/meta.yml b/modules/nf-core/krakentools/kreport2krona/meta.yml new file mode 100644 index 00000000..7a5dda4a --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/meta.yml @@ -0,0 +1,40 @@ +name: krakentools_kreport2krona +description: Takes a Kraken report file and prints out a krona-compatible TEXT file +keywords: + - kraken + - krona + - metagenomics + - visualization +tools: + - krakentools: + description: KrakenTools is a suite of scripts to be used for post-analysis of Kraken/KrakenUniq/Kraken2/Bracken results. Please cite the relevant paper if using KrakenTools with any of the listed programs. + homepage: https://github.com/jenniferlu717/KrakenTools + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - kreport: + type: file + description: Kraken report + pattern: "*.{txt,kreport}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - krona: + type: file + description: Krona text-based input file converted from Kraken report + pattern: "*.{txt,krona}" +authors: + - "@MillironX" +maintainers: + - "@MillironX" diff --git a/modules/nf-core/krona/kronadb/environment.yml b/modules/nf-core/krona/kronadb/environment.yml new file mode 100644 index 00000000..1646628f --- /dev/null +++ b/modules/nf-core/krona/kronadb/environment.yml @@ -0,0 +1,7 @@ +name: krona_kronadb +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::krona=2.7.1 diff --git a/modules/nf-core/krona/kronadb/main.nf b/modules/nf-core/krona/kronadb/main.nf new file mode 100644 index 00000000..1d9bf698 --- /dev/null +++ b/modules/nf-core/krona/kronadb/main.nf @@ -0,0 +1,30 @@ +def VERSION='2.7.1' // Version information not provided by tool on CLI + +process KRONA_KRONADB { + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krona:2.7.1--pl526_5' : + 'biocontainers/krona:2.7.1--pl526_5' }" + + output: + path 'taxonomy/taxonomy.tab', emit: db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + ktUpdateTaxonomy.sh \\ + $args \\ + taxonomy/ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krona: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/krona/kronadb/meta.yml b/modules/nf-core/krona/kronadb/meta.yml new file mode 100644 index 00000000..0d42bb10 --- /dev/null +++ b/modules/nf-core/krona/kronadb/meta.yml @@ -0,0 +1,26 @@ +name: krona_kronadb +description: KronaTools Update Taxonomy downloads a taxonomy database +keywords: + - database + - taxonomy + - krona +tools: + - krona: + description: Krona Tools is a set of scripts to create Krona charts from several Bioinformatics tools as well as from text and XML files. + homepage: https://github.com/marbl/Krona/wiki/KronaTools + documentation: https://github.com/marbl/Krona/wiki/Installing + doi: 10.1186/1471-2105-12-385 +# There is no input. This module downloads a pre-built taxonomy database for use with Krona Tools. +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - db: + type: file + description: A TAB separated file that contains a taxonomy database. + pattern: "*.{tab}" +authors: + - "@mjakobs" +maintainers: + - "@mjakobs" diff --git a/modules/nf-core/krona/ktimporttaxonomy/environment.yml b/modules/nf-core/krona/ktimporttaxonomy/environment.yml new file mode 100644 index 00000000..1909e15f --- /dev/null +++ b/modules/nf-core/krona/ktimporttaxonomy/environment.yml @@ -0,0 +1,7 @@ +name: krona_ktimporttaxonomy +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::krona=2.8 diff --git a/modules/nf-core/krona/ktimporttaxonomy/main.nf b/modules/nf-core/krona/ktimporttaxonomy/main.nf new file mode 100644 index 00000000..5a9f3ff8 --- /dev/null +++ b/modules/nf-core/krona/ktimporttaxonomy/main.nf @@ -0,0 +1,41 @@ +process KRONA_KTIMPORTTAXONOMY { + tag "${meta.id}" + label 'process_single' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. 
+ conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krona:2.8--pl5262hdfd78af_2' : + 'biocontainers/krona:2.8--pl5262hdfd78af_2' }" + + input: + tuple val(meta), path(report) + path taxonomy, stageAs: 'taxonomy.tab' + + output: + tuple val(meta), path ('*.html'), emit: html + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '2.8' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + TAXONOMY=\$(find -L . -name '*.tab' -exec dirname {} \\;) + echo \$TAXONOMY + + ktImportTaxonomy \\ + $args \\ + -o ${prefix}.html \\ + -tax \$TAXONOMY/ \\ + $report + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krona: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/krona/ktimporttaxonomy/meta.yml b/modules/nf-core/krona/ktimporttaxonomy/meta.yml new file mode 100644 index 00000000..de548210 --- /dev/null +++ b/modules/nf-core/krona/ktimporttaxonomy/meta.yml @@ -0,0 +1,45 @@ +name: krona_ktimporttaxonomy +description: KronaTools Import Taxonomy imports taxonomy classifications and produces an interactive Krona plot. +keywords: + - plot + - taxonomy + - interactive + - html + - visualisation + - krona chart +tools: + - krona: + description: Krona Tools is a set of scripts to create Krona charts from several Bioinformatics tools as well as from text and XML files. + homepage: https://github.com/marbl/Krona/wiki/KronaTools + documentation: http://manpages.ubuntu.com/manpages/impish/man1/ktImportTaxonomy.1.html + doi: 10.1186/1471-2105-12-385 +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - database: + type: file + description: | + Path to a Krona taxonomy .tab file normally downloaded and generated by + krona/ktUpdateTaxonomy. Custom taxonomy files can have any name, but + must end in `.tab`. + pattern: "*tab" + - report: + type: file + description: "A tab-delimited file with taxonomy IDs and (optionally) query IDs, magnitudes, and scores. Query IDs are taken from column 1, taxonomy IDs from column 2, and scores from column 3. Lines beginning with # will be ignored." + pattern: "*.{tsv}" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - html: + type: file + description: A html file containing an interactive krona plot. + pattern: "*.{html}" +authors: + - "@mjakobs" +maintainers: + - "@mjakobs" diff --git a/modules/nf-core/maxbin2/environment.yml b/modules/nf-core/maxbin2/environment.yml new file mode 100644 index 00000000..8a881999 --- /dev/null +++ b/modules/nf-core/maxbin2/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::maxbin2=2.2.7 diff --git a/modules/nf-core/maxbin2/main.nf b/modules/nf-core/maxbin2/main.nf new file mode 100644 index 00000000..845c8e4e --- /dev/null +++ b/modules/nf-core/maxbin2/main.nf @@ -0,0 +1,57 @@ +process MAXBIN2 { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/maxbin2:2.2.7--he1b5a44_2' : + 'biocontainers/maxbin2:2.2.7--he1b5a44_2' }" + + input: + tuple val(meta), path(contigs), path(reads), path(abund) + + output: + tuple val(meta), path("*.fasta.gz") , emit: binned_fastas + tuple val(meta), path("*.summary") , emit: summary + tuple val(meta), path("*.abundance") , emit: abundance , optional: true + tuple val(meta), path("*.log.gz") , emit: log + tuple val(meta), path("*.marker.gz") , emit: marker_counts + tuple val(meta), path("*.noclass.gz") , emit: unbinned_fasta + tuple val(meta), path("*.tooshort.gz"), emit: tooshort_fasta + tuple val(meta), path("*_bin.tar.gz") , emit: marker_bins , optional: true + tuple val(meta), path("*_gene.tar.gz"), emit: marker_genes, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (reads && abund) { error("ERROR: MaxBin2 can only accept one of `reads` or `abund`, not both. Check input.") } + def associate_files = "" + if ( reads ) { + associate_files = "-reads $reads" + } else if ( abund instanceof List ) { + associate_files = "-abund ${abund[0]}" + for (i in 2..abund.size()) { associate_files += " -abund$i ${abund[i-1]}" } + } else { + associate_files = "-abund $abund" + } + """ + mkdir input/ && mv $contigs input/ + run_MaxBin.pl \\ + -contig input/$contigs \\ + $associate_files \\ + -thread $task.cpus \\ + $args \\ + -out $prefix + + gzip *.fasta *.noclass *.tooshort *log *.marker + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + maxbin2: \$( run_MaxBin.pl -v | head -n 1 | sed 's/MaxBin //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/maxbin2/meta.yml b/modules/nf-core/maxbin2/meta.yml new file mode 100644 index 00000000..9546afb1 --- /dev/null +++ b/modules/nf-core/maxbin2/meta.yml @@ -0,0 +1,143 @@ +name: maxbin2 +description: MaxBin is a software that is capable of clustering metagenomic contigs +keywords: + - metagenomics + - assembly + - binning + - maxbin2 + - de novo assembly + - mags + - metagenome-assembled genomes + - contigs +tools: + - maxbin2: + description: MaxBin is software for binning assembled metagenomic sequences based + on an Expectation-Maximization algorithm. + homepage: https://sourceforge.net/projects/maxbin/ + documentation: https://sourceforge.net/projects/maxbin/ + tool_dev_url: https://sourceforge.net/projects/maxbin/ + doi: "10.1093/bioinformatics/btv638" + licence: ["BSD 3-clause"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - contigs: + type: file + description: Multi FASTA file containing assembled contigs of a given sample + pattern: "*.fasta" + - reads: + type: file + description: Reads used to assemble contigs in FASTA or FASTQ format. Do not + supply at the same time as abundance files. + pattern: "*.fasta" + - abund: + type: list + description: One or more contig abundance files, i.e. average depth of reads against each contig. See MaxBin2 + README for details. Do not supply at the same time as read files. +output: + - binned_fastas: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.fasta.gz": + type: file + description: Binned contigs, one per bin designated with numeric IDs + pattern: "*.fasta.gz" + - summary: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.summary": + type: file + description: Summary file describing which contigs are being classified into + which bin + pattern: "*.summary" + - abundance: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.abundance": + type: file + description: Abundance of each bin if multiple abundance files were supplied + which bin + pattern: "*.abundance" + - log: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.log.gz": + type: file + description: Log file recording the core steps of MaxBin algorithm + pattern: "*.log.gz" + - marker_counts: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.marker.gz": + type: file + description: Marker counts + pattern: "*.marker.gz" + - unbinned_fasta: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.noclass.gz": + type: file + description: All sequences that pass the minimum length threshold but are not + classified successfully. + pattern: "*.noclass.gz" + - tooshort_fasta: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tooshort.gz": + type: file + description: All sequences that do not meet the minimum length threshold. + pattern: "*.tooshort.gz" + - marker_bins: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*_bin.tar.gz": + type: file + description: Marker bins + pattern: "*_bin.tar.gz" + - marker_genes: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*_gene.tar.gz": + type: file + description: Marker genes + pattern: "*_gene.tar.gz" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/maxbin2/tests/main.nf.test b/modules/nf-core/maxbin2/tests/main.nf.test new file mode 100644 index 00000000..efb23c2b --- /dev/null +++ b/modules/nf-core/maxbin2/tests/main.nf.test @@ -0,0 +1,47 @@ + +nextflow_process { + + name "Test Process MAXBIN2" + script "../main.nf" + process "MAXBIN2" + + tag "modules" + tag "modules_nfcore" + tag "maxbin2" + + test("test-maxbin2") { + + when { + process { + """ + input[0] = [ + [ id:'test1', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fasta/test1.contigs.fa.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/prokaryotes/bacteroides_fragilis/illumina/fastq/test1_1.fastq.gz', checkIfExists: true), + [] + ] + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.binned_fastas, + process.out.summary, + file(process.out.log[0][1]).name, + process.out.marker_counts, + file(process.out.unbinned_fasta[0][1]).name, // empty + process.out.tooshort_fasta, + file(process.out.marker_bins[0][1]).name, // unstable + process.out.marker_genes, + process.out.versions + ).match() + } + ) + } + } + +} diff --git a/modules/nf-core/maxbin2/tests/main.nf.test.snap b/modules/nf-core/maxbin2/tests/main.nf.test.snap new file mode 100644 index 00000000..caecef8e --- /dev/null +++ b/modules/nf-core/maxbin2/tests/main.nf.test.snap @@ -0,0 +1,59 @@ +{ + "test-maxbin2": { + "content": [ + [ + [ + { + "id": "test1", + "single_end": false + }, + [ + "test1.001.fasta.gz:md5,92eeca569534d770af91a1c07e62afa9", + "test1.002.fasta.gz:md5,628ef3b2e6647aed95511c28ea0dc229" + ] + ] + ], + [ + [ + { + "id": "test1", + "single_end": false + }, + "test1.summary:md5,7cdbedbfadd7a96203bdeca55ad822da" + ] + ], + "test1.log.gz", + [ + [ + { + "id": "test1", + "single_end": false + }, + "test1.marker.gz:md5,928994e84b9d723a8a48841432e1a262" + ] + ], + "test1.noclass.gz", + [ + [ + { + "id": "test1", + "single_end": false + }, + "test1.tooshort.gz:md5,b4e48e83637217aa9eba7f27f5990b24" + ] + ], + "test1.marker_of_each_bin.tar.gz", + [ + + ], + [ + "versions.yml:md5,a8b5754ee5df020d62ff25306376fc0a" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-08-30T14:56:43.557114" + } +} \ No newline at end of file diff --git a/modules/nf-core/megahit/environment.yml b/modules/nf-core/megahit/environment.yml new file mode 100644 index 00000000..eed8b725 --- /dev/null +++ b/modules/nf-core/megahit/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::megahit=1.2.9 + - conda-forge::pigz=2.8 diff --git a/modules/nf-core/megahit/main.nf b/modules/nf-core/megahit/main.nf new file mode 100644 index 00000000..db061242 --- /dev/null +++ b/modules/nf-core/megahit/main.nf @@ -0,0 +1,70 @@ +process MEGAHIT { + tag "${meta.id}" + label 'process_high' + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f2/f2cb827988dca7067ff8096c37cb20bc841c878013da52ad47a50865d54efe83/data' : + 'community.wave.seqera.io/library/megahit_pigz:87a590163e594224' }" + + input: + tuple val(meta), path(reads1), path(reads2) + + output: + tuple val(meta), path("*.contigs.fa.gz") , emit: contigs + tuple val(meta), path("intermediate_contigs/k*.contigs.fa.gz") , emit: k_contigs + tuple val(meta), path("intermediate_contigs/k*.addi.fa.gz") , emit: addi_contigs + tuple val(meta), path("intermediate_contigs/k*.local.fa.gz") , emit: local_contigs + tuple val(meta), path("intermediate_contigs/k*.final.contigs.fa.gz"), emit: kfinal_contigs + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reads_command = meta.single_end || !reads2 ? "-r ${reads1.join(',')}" : "-1 ${reads1.join(',')} -2 ${reads2.join(',')}" + """ + megahit \\ + ${reads_command} \\ + ${args} \\ + -t ${task.cpus} \\ + --out-prefix ${prefix} + + pigz \\ + --no-name \\ + -p ${task.cpus} \\ + ${args2} \\ + megahit_out/*.fa \\ + megahit_out/intermediate_contigs/*.fa + + mv megahit_out/* . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megahit: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reads_command = meta.single_end || !reads2 ? "-r ${reads1}" : "-1 ${reads1.join(',')} -2 ${reads2.join(',')}" + """ + mkdir -p intermediate_contigs + echo "" | gzip > ${prefix}.contigs.fa.gz + echo "" | gzip > intermediate_contigs/k21.contigs.fa.gz + echo "" | gzip > intermediate_contigs/k21.addi.fa.gz + echo "" | gzip > intermediate_contigs/k21.local.fa.gz + echo "" | gzip > intermediate_contigs/k21.final.contigs.fa.gz + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megahit: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/megahit/meta.yml b/modules/nf-core/megahit/meta.yml new file mode 100644 index 00000000..04dab4c2 --- /dev/null +++ b/modules/nf-core/megahit/meta.yml @@ -0,0 +1,114 @@ +name: megahit +description: An ultra-fast metagenomic assembler for large and complex metagenomics +keywords: + - megahit + - denovo + - assembly + - debruijn + - metagenomics +tools: + - megahit: + description: "An ultra-fast single-node solution for large and complex metagenomics + assembly via succinct de Bruijn graph" + homepage: https://github.com/voutcn/megahit + documentation: https://github.com/voutcn/megahit + tool_dev_url: https://github.com/voutcn/megahit + doi: "10.1093/bioinformatics/btv033" + licence: ["GPL v3"] + args_id: "$args" + identifier: biotools:megahit + - pigz: + description: "Parallel implementation of the gzip algorithm." + homepage: "https://zlib.net/pigz/" + documentation: "https://zlib.net/pigz/pigz.pdf" + args_id: "$args2" + + identifier: biotools:megahit +input: + - - meta: + type: map + description: | + Groovy Map containing sample information and input single, or paired-end FASTA/FASTQ files (optionally decompressed) + e.g. 
[ id:'test', single_end:false ] + - reads1: + type: file + description: | + A single or list of input FastQ files for single-end or R1 of paired-end library(s), + respectively in gzipped or uncompressed FASTQ or FASTA format. + - reads2: + type: file + description: | + A single or list of input FastQ files for R2 of paired-end library(s), + respectively in gzipped or uncompressed FASTQ or FASTA format. +output: + - contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.contigs.fa.gz": + type: file + description: Final final contigs result of the assembly in FASTA format. + pattern: "*.contigs.fa.gz" + - k_contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.contigs.fa.gz: + type: file + description: Contigs assembled from the de Bruijn graph of order-K + pattern: "k*.contigs.fa.gz" + - addi_contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.addi.fa.gz: + type: file + description: Contigs assembled after iteratively removing local low coverage + unitigs in the de Bruijn graph of order-K + pattern: "k*.addi.fa.gz" + - local_contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.local.fa.gz: + type: file + description: Contigs of the locally assembled contigs for k=K + pattern: "k*.local.fa.gz" + - kfinal_contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.final.contigs.fa.gz: + type: file + description: Stand-alone contigs for k=K; if local assembly is turned on, the + file will be empty + pattern: "k*.final.contigs.fa.gz" + - log: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.log": + type: file + description: Log file containing statistics of the assembly output + pattern: "*.log" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/megahit/tests/main.nf.test b/modules/nf-core/megahit/tests/main.nf.test new file mode 100644 index 00000000..b52765d4 --- /dev/null +++ b/modules/nf-core/megahit/tests/main.nf.test @@ -0,0 +1,126 @@ +nextflow_process { + + name "Test Process MEGAHIT" + script "../main.nf" + process "MEGAHIT" + + tag "modules" + tag "modules_nfcore" + tag "megahit" + + test("sarscov2 - fastq - se") { + + when { + process { + """ + input[0] = [ [id:"test", single_end:true], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + []] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.contigs[0][1]).linesGzip.toString().contains(">k") }, + { assert process.out.k_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.addi_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.local_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.kfinal_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert snapshot( + path(process.out.log[0][1]).readLines().last().contains("ALL DONE. Time elapsed"), + process.out.versions + ).match() + } + ) + } + + } + + test("sarscov2 - fastq - pe") { + + when { + process { + """ + input[0] = [ [id:"test", single_end:false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.contigs[0][1]).linesGzip.toString().contains(">k") }, + { assert process.out.k_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.addi_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.local_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.kfinal_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert snapshot( + path(process.out.log[0][1]).readLines().last().contains("ALL DONE. 
Time elapsed"), + process.out.versions + ).match() + } + ) + } + + } + + test("sarscov2 - fastq - pe - coassembly") { + + when { + process { + """ + input[0] = [ [id:"test", single_end:false], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true)] , + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.contigs[0][1]).linesGzip.toString().contains(">k") }, + { assert process.out.k_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.addi_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.local_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.kfinal_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert snapshot( + path(process.out.log[0][1]).readLines().last().contains("ALL DONE. Time elapsed"), + process.out.versions + ).match() + } + ) + } + + } + + test("sarscov2 - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id:"test", single_end:true], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + [] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + +} diff --git a/modules/nf-core/megahit/tests/main.nf.test.snap b/modules/nf-core/megahit/tests/main.nf.test.snap new file mode 100644 index 00000000..4677cc33 --- /dev/null +++ b/modules/nf-core/megahit/tests/main.nf.test.snap @@ -0,0 +1,172 @@ +{ + "sarscov2 - fastq - se": { + "content": [ + true, + [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:45:42.387947698" + }, + "sarscov2 - fastq - pe": { + "content": [ + true, + [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:45:48.679485983" + }, + "sarscov2 - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "k21.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.addi.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.local.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": true + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ], + "addi_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.addi.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "contigs": [ + [ + { + "id": 
"test", + "single_end": true + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "k_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "k21.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "kfinal_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "local_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.local.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "log": [ + [ + { + "id": "test", + "single_end": true + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:44:35.245399991" + }, + "sarscov2 - fastq - pe - coassembly": { + "content": [ + true, + [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:45:56.23363342" + } +} \ No newline at end of file diff --git a/modules/nf-core/megahit/tests/tags.yml b/modules/nf-core/megahit/tests/tags.yml new file mode 100644 index 00000000..9e865846 --- /dev/null +++ b/modules/nf-core/megahit/tests/tags.yml @@ -0,0 +1,2 @@ +megahit: + - "modules/nf-core/megahit/**" diff --git a/modules/nf-core/metabat2/jgisummarizebamcontigdepths/main.nf b/modules/nf-core/metabat2/jgisummarizebamcontigdepths/main.nf new file mode 100644 index 00000000..7804ea01 --- /dev/null +++ b/modules/nf-core/metabat2/jgisummarizebamcontigdepths/main.nf @@ -0,0 +1,38 @@ +process METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::metabat2=2.15" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/metabat2:2.15--h986a166_1' : + 'biocontainers/metabat2:2.15--h986a166_1' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.txt.gz"), emit: depth + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + export OMP_NUM_THREADS=$task.cpus + + jgi_summarize_bam_contig_depths \\ + --outputDepth ${prefix}.txt \\ + $args \\ + $bam + + bgzip --threads $task.cpus ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metabat2: \$( metabat2 --help 2>&1 | head -n 2 | tail -n 1| sed 's/.*\\:\\([0-9]*\\.[0-9]*\\).*/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/metabat2/jgisummarizebamcontigdepths/meta.yml b/modules/nf-core/metabat2/jgisummarizebamcontigdepths/meta.yml new file mode 100644 index 00000000..ff0ab40e --- /dev/null +++ b/modules/nf-core/metabat2/jgisummarizebamcontigdepths/meta.yml @@ -0,0 +1,50 @@ +name: metabat2_jgisummarizebamcontigdepths +description: Depth computation per contig step of metabat2 +keywords: + - sort + - binning + - depth + - bam + - coverage + - de novo assembly +tools: + - metabat2: + description: Metagenome binning + homepage: https://bitbucket.org/berkeleylab/metabat/src/master/ + documentation: https://bitbucket.org/berkeleylab/metabat/src/master/ + tool_dev_url: https://bitbucket.org/berkeleylab/metabat/src/master/ + doi: "10.7717/peerj.7359" + licence: ["BSD-3-clause-LBNL"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM file of reads aligned on the assembled contigs + pattern: "*.bam" + - bai: + type: file + description: BAM index file + pattern: "*.bam.bai" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - depth: + type: file + description: Text file listing the coverage per contig + pattern: ".txt.gz" + +authors: + - "@maxibor" diff --git a/modules/nf-core/metabat2/metabat2/main.nf b/modules/nf-core/metabat2/metabat2/main.nf new file mode 100644 index 00000000..7cbee678 --- /dev/null +++ b/modules/nf-core/metabat2/metabat2/main.nf @@ -0,0 +1,48 @@ +process METABAT2_METABAT2 { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::metabat2=2.15" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/metabat2:2.15--h986a166_1' : + 'biocontainers/metabat2:2.15--h986a166_1' }" + + input: + tuple val(meta), path(fasta), path(depth) + + output: + tuple val(meta), path("*.tooShort.fa.gz") , optional:true, emit: tooshort + tuple val(meta), path("*.lowDepth.fa.gz") , optional:true, emit: lowdepth + tuple val(meta), path("*.unbinned.fa.gz") , optional:true, emit: unbinned + tuple val(meta), path("*.tsv.gz") , optional:true, emit: membership + tuple val(meta), path("*[!lowDepth|tooShort|unbinned].fa.gz"), optional:true, emit: fasta + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def decompress_depth = depth ? "gzip -d -f $depth" : "" + def depth_file = depth ? 
"-a ${depth.baseName}" : "" + """ + $decompress_depth + + metabat2 \\ + $args \\ + -i $fasta \\ + $depth_file \\ + -t $task.cpus \\ + --saveCls \\ + -o ${prefix} + + gzip -cn ${prefix} > ${prefix}.tsv.gz + find . -name "*.fa" -type f | xargs -t -n 1 bgzip -@ ${task.cpus} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metabat2: \$( metabat2 --help 2>&1 | head -n 2 | tail -n 1| sed 's/.*\\:\\([0-9]*\\.[0-9]*\\).*/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/metabat2/metabat2/meta.yml b/modules/nf-core/metabat2/metabat2/meta.yml new file mode 100644 index 00000000..37f80fdf --- /dev/null +++ b/modules/nf-core/metabat2/metabat2/meta.yml @@ -0,0 +1,69 @@ +name: metabat2_metabat2 +description: Metagenome binning of contigs +keywords: + - sort + - binning + - depth + - bam + - coverage + - de novo assembly +tools: + - metabat2: + description: Metagenome binning + homepage: https://bitbucket.org/berkeleylab/metabat/src/master/ + documentation: https://bitbucket.org/berkeleylab/metabat/src/master/ + tool_dev_url: https://bitbucket.org/berkeleylab/metabat/src/master/ + doi: "10.7717/peerj.7359" + licence: ["BSD-3-clause-LBNL"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Fasta file of the assembled contigs + pattern: "*.{fa,fas,fasta,fna,fa.gz,fas.gz,fasta.gz,fna.gz}" + - depth: + type: file + description: | + Optional text file listing the coverage per contig pre-generated + by metabat2_jgisummarizebamcontigdepths + pattern: "*.txt" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fasta: + type: file + description: Bins created from assembled contigs in fasta file + pattern: "*.fa.gz" + - tooshort: + type: file + description: Contigs that did not pass length filtering + pattern: "*.tooShort.fa.gz" + - lowdepth: + type: file + description: Contigs that did not have sufficient depth for binning + pattern: "*.lowDepth.fa.gz" + - unbinned: + type: file + description: Contigs that pass length and depth filtering but could not be binned + pattern: "*.unbinned.fa.gz" + - membership: + type: file + description: cluster memberships as a matrix format. + pattern: "*.tsv.gz" + +authors: + - "@maxibor" + - "@jfy133" diff --git a/modules/nf-core/metaeuk/easypredict/main.nf b/modules/nf-core/metaeuk/easypredict/main.nf new file mode 100644 index 00000000..5caf38f9 --- /dev/null +++ b/modules/nf-core/metaeuk/easypredict/main.nf @@ -0,0 +1,62 @@ +process METAEUK_EASYPREDICT { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::metaeuk=6.a5d39d9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/metaeuk:6.a5d39d9--pl5321hf1761c0_2': + 'biocontainers/metaeuk:6.a5d39d9--pl5321hf1761c0_2' }" + + input: + tuple val(meta), path(fasta) + path(database) + + output: + tuple val(meta), path("${prefix}.fas") , emit: faa + tuple val(meta), path("${prefix}.codon.fas"), emit: codon + tuple val(meta), path("*.tsv") , emit: tsv + tuple val(meta), path("*.gff") , emit: gff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + if [ -d ${database} ]; then + ## if supplying an mmseqs database as a directory, metaeuk requires the basename of the database + DBBASE=`find ${database}/ -name "*.version" -exec sh -c 'file=\$(basename {}); echo \${file%%.*}' \\;` + DB=`echo "${database}/\${DBBASE}"` + else + DB=${database} + fi + + metaeuk easy-predict \\ + ${fasta} \\ + \${DB} \\ + ${prefix} \\ + tmp/ \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaeuk: \$(metaeuk | grep 'Version' | sed 's/metaeuk Version: //') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.fas + touch ${prefix}.codon.fas + touch ${prefix}.headersMap.tsv + touch ${prefix}.gff + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaeuk: \$(metaeuk | grep 'Version' | sed 's/metaeuk Version: //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/metaeuk/easypredict/meta.yml b/modules/nf-core/metaeuk/easypredict/meta.yml new file mode 100644 index 00000000..6fe44d0b --- /dev/null +++ b/modules/nf-core/metaeuk/easypredict/meta.yml @@ -0,0 +1,67 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "metaeuk_easypredict" +description: Annotation of eukaryotic metagenomes using MetaEuk +keywords: + - genomics + - annotation + - fasta +tools: + - "metaeuk": + description: "MetaEuk - sensitive, high-throughput gene discovery and annotation for large-scale eukaryotic metagenomics" + homepage: https://github.com/soedinglab/metaeuk + documentation: https://github.com/soedinglab/metaeuk + tool_dev_url: https://github.com/soedinglab/metaeuk + doi: "10.1186/s40168-020-00808-x" + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + + - fasta: + type: file + description: Nucleotide FASTA file for annotation + pattern: "*.{fasta,fa,fasta.gz,fa.gz}" + + - database: + type: file + description: Either a fasta file containing protein sequences, or a directory containing an mmseqs2-formatted protein database + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + + - faa: + type: file + description: Protein FASTA file containing the exons from the input FASTA file + pattern: "*.{fas}" + + - codon: + type: file + description: Nucleotide FASTA file of protein-coding sequences + pattern: "*.{codon.fas}" + + - tsv: + type: file + description: TSV file containing locations of each protein coding sequence in the input fasta + pattern: "*.headersMap.{tsv}" + + - gff: + type: file + description: Annotation file in GFF format + pattern: "*.{gff}" + +authors: + - "@prototaxites" diff --git a/modules/nf-core/mmseqs/databases/main.nf b/modules/nf-core/mmseqs/databases/main.nf new file mode 100644 index 00000000..a23693c4 --- /dev/null +++ b/modules/nf-core/mmseqs/databases/main.nf @@ -0,0 +1,62 @@ +process MMSEQS_DATABASES { + tag "${database}" + label 'process_medium' + + conda "bioconda::mmseqs2=14.7e284" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mmseqs2:14.7e284--pl5321h6a68c12_2': + 'biocontainers/mmseqs2:14.7e284--pl5321h6a68c12_2' }" + + input: + val database + + output: + path "${prefix}/" , emit: database + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: 'mmseqs_database' + """ + mkdir ${prefix}/ + + mmseqs databases \\ + ${database} \\ + ${prefix}/database \\ + tmp/ \\ + --threads ${task.cpus} \\ + --compressed 1 \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: //') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: 'mmseqs_database' + """ + mkdir ${prefix}/ + + touch ${prefix}/database + touch ${prefix}/database.dbtype + touch ${prefix}/database_h + touch ${prefix}/database_h.dbtype + touch ${prefix}/database_h.index + touch ${prefix}/database.index + touch ${prefix}/database.lookup + touch ${prefix}/database_mapping + touch ${prefix}/database.source + touch ${prefix}/database_taxonomy + touch ${prefix}/database.version + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mmseqs: \$(mmseqs | grep 'Version' | sed 's/MMseqs2 Version: /') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mmseqs/databases/meta.yml b/modules/nf-core/mmseqs/databases/meta.yml new file mode 100644 index 00000000..edd093bd --- /dev/null +++ b/modules/nf-core/mmseqs/databases/meta.yml @@ -0,0 +1,34 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "mmseqs_databases" +description: Download an mmseqs-formatted database +keywords: + - database + - indexing + - clustering + - searching +tools: + - "mmseqs": + description: "MMseqs2: ultra fast and sensitive sequence search and clustering suite" + homepage: "https://github.com/soedinglab/MMseqs2" + documentation: "https://mmseqs.com/latest/userguide.pdf" + tool_dev_url: "https://github.com/soedinglab/MMseqs2" + doi: "10.1093/bioinformatics/btw006" + licence: "['GPL v3']" + +input: + - database: + type: string + description: Database available through the mmseqs2 databases interface - see https://github.com/soedinglab/MMseqs2/wiki#downloading-databases for details + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - database: 
+ type: directory + description: Directory containing processed mmseqs database + +authors: + - "@prototaxites" diff --git a/modules/nf-core/nanolyse/environment.yml b/modules/nf-core/nanolyse/environment.yml new file mode 100644 index 00000000..7d738ba9 --- /dev/null +++ b/modules/nf-core/nanolyse/environment.yml @@ -0,0 +1,7 @@ +name: nanolyse +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::nanolyse=1.2.0 diff --git a/modules/nf-core/nanolyse/main.nf b/modules/nf-core/nanolyse/main.nf new file mode 100644 index 00000000..68d5d804 --- /dev/null +++ b/modules/nf-core/nanolyse/main.nf @@ -0,0 +1,34 @@ +process NANOLYSE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/nanolyse:1.2.0--py_0' : + 'biocontainers/nanolyse:1.2.0--py_0' }" + + input: + tuple val(meta), path(fastq) + path fasta + + output: + tuple val(meta), path("*.fastq.gz"), emit: fastq + path "*.log" , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gunzip -c $fastq | NanoLyse -r $fasta | gzip > ${prefix}.fastq.gz + mv NanoLyse.log ${prefix}.nanolyse.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nanolyse: \$(NanoLyse --version 2>&1 | sed -e "s/NanoLyse //g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/nanolyse/meta.yml b/modules/nf-core/nanolyse/meta.yml new file mode 100644 index 00000000..375ad9bc --- /dev/null +++ b/modules/nf-core/nanolyse/meta.yml @@ -0,0 +1,49 @@ +name: nanolyse +description: DNA contaminant removal using NanoLyse +keywords: + - contaminant_removal +tools: + - nanolyse: + description: | + DNA contaminant removal using NanoLyse + homepage: https://github.com/wdecoster/nanolyse + documentation: https://github.com/wdecoster/nanolyse#nanolyse + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: | + Basecalled reads in FASTQ.GZ format + pattern: "*.fastq.gz" + - fasta: + type: file + description: | + A reference fasta file against which to filter. + pattern: "*.fasta" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: Reads with contaminants removed in FASTQ format + pattern: "*.fastq.gz" + - log: + type: file + description: Log of the Nanolyse run. 
+ pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@yuukiiwa" +maintainers: + - "@yuukiiwa" diff --git a/modules/nf-core/nanoplot/environment.yml b/modules/nf-core/nanoplot/environment.yml new file mode 100644 index 00000000..219cd2e3 --- /dev/null +++ b/modules/nf-core/nanoplot/environment.yml @@ -0,0 +1,7 @@ +name: nanoplot +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::nanoplot=1.41.6 diff --git a/modules/nf-core/nanoplot/main.nf b/modules/nf-core/nanoplot/main.nf new file mode 100644 index 00000000..c1816caf --- /dev/null +++ b/modules/nf-core/nanoplot/main.nf @@ -0,0 +1,58 @@ +process NANOPLOT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/nanoplot:1.41.6--pyhdfd78af_0' : + 'biocontainers/nanoplot:1.41.6--pyhdfd78af_0' }" + + input: + tuple val(meta), path(ontfile) + + output: + tuple val(meta), path("*.html") , emit: html + tuple val(meta), path("*.png") , optional: true, emit: png + tuple val(meta), path("*.txt") , emit: txt + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def input_file = ("$ontfile".endsWith(".fastq.gz") || "$ontfile".endsWith(".fq.gz")) ? "--fastq ${ontfile}" : + ("$ontfile".endsWith(".txt")) ? "--summary ${ontfile}" : '' + """ + NanoPlot \\ + $args \\ + -t $task.cpus \\ + $input_file + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nanoplot: \$(echo \$(NanoPlot --version 2>&1) | sed 's/^.*NanoPlot //; s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + touch LengthvsQualityScatterPlot_dot.html + touch LengthvsQualityScatterPlot_kde.html + touch NanoPlot-report.html + touch NanoPlot_20240301_1130.log + touch NanoStats.txt + touch Non_weightedHistogramReadlength.html + touch Non_weightedLogTransformed_HistogramReadlength.html + touch WeightedHistogramReadlength.html + touch WeightedLogTransformed_HistogramReadlength.html + touch Yield_By_Length.html + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nanoplot: \$(echo \$(NanoPlot --version 2>&1) | sed 's/^.*NanoPlot //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/nanoplot/meta.yml b/modules/nf-core/nanoplot/meta.yml new file mode 100644 index 00000000..46fbd562 --- /dev/null +++ b/modules/nf-core/nanoplot/meta.yml @@ -0,0 +1,62 @@ +name: nanoplot +description: Run NanoPlot on nanopore-sequenced reads +keywords: + - quality control + - qc + - fastq + - sequencing summary + - nanopore +tools: + - nanoplot: + description: | + NanoPlot is a tool for ploting long-read sequencing data and + alignment. + homepage: http://nanoplot.bioinf.be + documentation: https://github.com/wdecoster/NanoPlot + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: | + List of input basecalled-FastQ files. + - summary_txt: + type: file + description: | + List of sequencing_summary.txt files from running basecalling. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - html: + type: file + description: NanoPlot report + pattern: "*{.html}" + - png: + type: file + description: Plots generated by NanoPlot + pattern: "*{.png}" + - txt: + type: file + description: Stats from NanoPlot + pattern: "*{.txt}" + - log: + type: file + description: log file of NanoPlot run + pattern: "*{.log}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@yuukiiwa" +maintainers: + - "@drpatelh" + - "@yuukiiwa" diff --git a/modules/nf-core/nanoplot/tests/main.nf.test b/modules/nf-core/nanoplot/tests/main.nf.test new file mode 100644 index 00000000..29b57c10 --- /dev/null +++ b/modules/nf-core/nanoplot/tests/main.nf.test @@ -0,0 +1,94 @@ +nextflow_process { + + name "Test Process NANOPLOT" + tag "modules_nfcore" + tag "modules" + tag "nanoplot" + script "../main.nf" + process "NANOPLOT" + + test("NanoPlot summary") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file(params.test_data['sarscov2']['nanopore']['test_sequencing_summary'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.txt, + process.out.versions + ).match() + }, + { + with(process.out.html.get(0)) { + assert get(1).collect { p -> file(p).getName() }.contains("NanoPlot-report.html") + } + } + ) + } + + } + + test("NanoPlot FASTQ") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file(params.test_data['sarscov2']['nanopore']['test_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.txt, + process.out.versions + ).match() + }, + { + with(process.out.html.get(0)) { + assert get(1).collect { p -> file(p).getName() }.contains("NanoPlot-report.html") + } + } + ) + } + + } + + test("NanoPlot - stub") { + + options "-stub" + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ file(params.test_data['sarscov2']['nanopore']['test_sequencing_summary'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} diff --git a/modules/nf-core/nanoplot/tests/main.nf.test.snap b/modules/nf-core/nanoplot/tests/main.nf.test.snap new file mode 100644 index 00000000..f7f8028a --- /dev/null +++ b/modules/nf-core/nanoplot/tests/main.nf.test.snap @@ -0,0 +1,131 @@ +{ + "NanoPlot - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "LengthvsQualityScatterPlot_dot.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "LengthvsQualityScatterPlot_kde.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "NanoPlot-report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "Non_weightedHistogramReadlength.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "Non_weightedLogTransformed_HistogramReadlength.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "WeightedHistogramReadlength.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "WeightedLogTransformed_HistogramReadlength.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "Yield_By_Length.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "NanoStats.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "NanoPlot_20240301_1130.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,961cee64736aeb9e56b65d05ee3cd1a5" + ], + "html": [ + [ + { 
+ "id": "test" + }, + [ + "LengthvsQualityScatterPlot_dot.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "LengthvsQualityScatterPlot_kde.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "NanoPlot-report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "Non_weightedHistogramReadlength.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "Non_weightedLogTransformed_HistogramReadlength.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "WeightedHistogramReadlength.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "WeightedLogTransformed_HistogramReadlength.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "Yield_By_Length.html:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "log": [ + [ + { + "id": "test" + }, + "NanoPlot_20240301_1130.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "png": [ + + ], + "txt": [ + [ + { + "id": "test" + }, + "NanoStats.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,961cee64736aeb9e56b65d05ee3cd1a5" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-03-01T14:54:18.083198" + }, + "NanoPlot FASTQ": { + "content": [ + [ + [ + { + "id": "test" + }, + "NanoStats.txt:md5,50373c7543e71e3baf040926f0c69ac1" + ] + ], + [ + "versions.yml:md5,961cee64736aeb9e56b65d05ee3cd1a5" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2023-10-17T16:18:44.848688965" + }, + "NanoPlot summary": { + "content": [ + [ + [ + { + "id": "test" + }, + "NanoStats.txt:md5,90464bf7049ca66106de56e7eac23dd4" + ] + ], + [ + "versions.yml:md5,961cee64736aeb9e56b65d05ee3cd1a5" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2023-10-17T16:18:31.104601192" + } +} \ No newline at end of file diff --git a/modules/nf-core/nanoplot/tests/tags.yaml b/modules/nf-core/nanoplot/tests/tags.yaml new file mode 100644 index 00000000..7c6ce3fa --- /dev/null +++ b/modules/nf-core/nanoplot/tests/tags.yaml @@ -0,0 +1,2 @@ +nanoplot: + - modules/nf-core/nanoplot/** diff --git a/modules/nf-core/nanoq/environment.yml b/modules/nf-core/nanoq/environment.yml new file mode 100644 index 00000000..1a95d24e --- /dev/null +++ b/modules/nf-core/nanoq/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::nanoq=0.10.0" diff --git a/modules/nf-core/nanoq/main.nf b/modules/nf-core/nanoq/main.nf new file mode 100644 index 00000000..6d35a407 --- /dev/null +++ b/modules/nf-core/nanoq/main.nf @@ -0,0 +1,49 @@ +process NANOQ { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/nanoq:0.10.0--h031d066_2' : + 'biocontainers/nanoq:0.10.0--h031d066_2'}" + + input: + tuple val(meta), path(ontreads) + val(output_format) //One of the following: fastq, fastq.gz, fastq.bz2, fastq.lzma, fasta, fasta.gz, fasta.bz2, fasta.lzma. 
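    // Illustrative usage sketch (not part of the module itself; the channel name is assumed
    // for the example): a DSL2 workflow would pass the reads channel plus a plain string for
    // the requested format, e.g.
    //   NANOQ ( ch_ont_reads, 'fastq.gz' )
    // The string must be one of the extensions listed above, since the
    // "*_filtered.${output_format}" output glob below is resolved against it.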
+ + output: + tuple val(meta), path("*.{stats,json}") , emit: stats + tuple val(meta), path("*_filtered.${output_format}") , emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_filtered" + """ + nanoq -i $ontreads \\ + ${args} \\ + -r ${prefix}.stats \\ + -o ${prefix}.$output_format + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nanoq: \$(nanoq --version | sed -e 's/nanoq //g') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}_filtered" + """ + echo "" | gzip > ${prefix}.$output_format + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + nanoq: \$(nanoq --version | sed -e 's/nanoq //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/nanoq/meta.yml b/modules/nf-core/nanoq/meta.yml new file mode 100644 index 00000000..0ff2b9b4 --- /dev/null +++ b/modules/nf-core/nanoq/meta.yml @@ -0,0 +1,63 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "nanoq" +description: Nanoq implements ultra-fast read filters and summary reports for high-throughput + nanopore reads. +keywords: + - nanoq + - Read filters + - Read trimming + - Read report +tools: + - "nanoq": + description: "Ultra-fast quality control and summary reports for nanopore reads" + homepage: "https://github.com/esteinig/nanoq" + documentation: "https://github.com/esteinig/nanoq" + tool_dev_url: "https://github.com/esteinig/nanoq" + doi: "10.21105/joss.02991" + licence: ["MIT"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - ontreads: + type: file + description: Compressed or uncompressed nanopore reads in fasta or fastq formats. + pattern: "*.{fa,fna,faa,fasta,fq,fastq}{,.gz,.bz2,.xz}" + - - output_format: + type: string + description: "Specifies the output format. One of these formats: fasta, fastq; + fasta.gz, fastq.gz; fasta.bz2, fastq.bz2; fasta.lzma, fastq.lzma." +output: + - stats: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.{stats,json}": + type: file + description: Summary report of reads statistics. + pattern: "*.{stats,json}" + - reads: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*_filtered.${output_format}": + type: file + description: Filtered reads. 
+ pattern: "*.{fasta,fastq}{,.gz,.bz2,.lzma}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@LilyAnderssonLee" +maintainers: + - "@LilyAnderssonLee" diff --git a/modules/nf-core/nanoq/tests/main.nf.test b/modules/nf-core/nanoq/tests/main.nf.test new file mode 100644 index 00000000..ef63d12f --- /dev/null +++ b/modules/nf-core/nanoq/tests/main.nf.test @@ -0,0 +1,122 @@ +nextflow_process { + + name "Test Process NANOQ" + script "../main.nf" + process "NANOQ" + + tag "modules" + tag "modules_nfcore" + tag "nanoq" + + test("sarscov2 - nanopore_uncompressed") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + + input[1] = 'fastq' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 - nanopore_compressed_gz") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + input[1] = 'fastq.gz' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + test("sarscov2 - nanopore_compressed_bz2") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + input[1] = 'fastq.bz2' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + test("sarscov2 - nanopore_compressed_lzma") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + input[1] = 'fastq.lzma' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 - nanopore_compressed_gz - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + input[1] = 'fastq.gz' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} diff --git a/modules/nf-core/nanoq/tests/main.nf.test.snap b/modules/nf-core/nanoq/tests/main.nf.test.snap new file mode 100644 index 00000000..b5dda2a7 --- /dev/null +++ b/modules/nf-core/nanoq/tests/main.nf.test.snap @@ -0,0 +1,267 @@ +{ + "sarscov2 - nanopore_compressed_gz": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,5ab32af3352dfeca8268e10edf6e4dbe" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq.gz:md5,7567d853ada6ac142332619d0b541d76" + ] + ], + "2": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq.gz:md5,7567d853ada6ac142332619d0b541d76" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,5ab32af3352dfeca8268e10edf6e4dbe" + ] + ], + 
"versions": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.1" + }, + "timestamp": "2024-07-11T11:39:32.117229" + }, + "sarscov2 - nanopore_compressed_gz - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "2": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.1" + }, + "timestamp": "2024-07-11T11:42:06.039307" + }, + "sarscov2 - nanopore_compressed_bz2": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,5ab32af3352dfeca8268e10edf6e4dbe" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq.bz2:md5,b53cf14fd4eb5b16c459c41f03cc8a4b" + ] + ], + "2": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq.bz2:md5,b53cf14fd4eb5b16c459c41f03cc8a4b" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,5ab32af3352dfeca8268e10edf6e4dbe" + ] + ], + "versions": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.1" + }, + "timestamp": "2024-07-11T11:39:36.674647" + }, + "sarscov2 - nanopore_compressed_lzma": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,5ab32af3352dfeca8268e10edf6e4dbe" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq.lzma:md5,65dda701689f913734dc245b68c89e07" + ] + ], + "2": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq.lzma:md5,65dda701689f913734dc245b68c89e07" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,5ab32af3352dfeca8268e10edf6e4dbe" + ] + ], + "versions": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.1" + }, + "timestamp": "2024-07-11T11:39:41.51344" + }, + "sarscov2 - nanopore_uncompressed": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,5ab32af3352dfeca8268e10edf6e4dbe" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq:md5,7567d853ada6ac142332619d0b541d76" + ] + ], + "2": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.fastq:md5,7567d853ada6ac142332619d0b541d76" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test_filtered.stats:md5,5ab32af3352dfeca8268e10edf6e4dbe" + ] + ], + "versions": [ + "versions.yml:md5,7a40efe417ff7dbb9e91e9c1629a04e6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": 
"23.04.1" + }, + "timestamp": "2024-07-11T11:39:26.868897" + } +} \ No newline at end of file diff --git a/modules/nf-core/nanoq/tests/tags.yml b/modules/nf-core/nanoq/tests/tags.yml new file mode 100644 index 00000000..37457df1 --- /dev/null +++ b/modules/nf-core/nanoq/tests/tags.yml @@ -0,0 +1,2 @@ +nanoq: + - "modules/nf-core/nanoq/**" diff --git a/modules/nf-core/porechop/abi/environment.yml b/modules/nf-core/porechop/abi/environment.yml new file mode 100644 index 00000000..dabb4921 --- /dev/null +++ b/modules/nf-core/porechop/abi/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::porechop_abi=0.5.0 diff --git a/modules/nf-core/porechop/abi/main.nf b/modules/nf-core/porechop/abi/main.nf new file mode 100644 index 00000000..88ec5bd0 --- /dev/null +++ b/modules/nf-core/porechop/abi/main.nf @@ -0,0 +1,50 @@ +process PORECHOP_ABI { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/porechop_abi:0.5.0--py310h590eda1_0': + 'biocontainers/porechop_abi:0.5.0--py310h590eda1_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz") , emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}.porechop_abi" + if ("$reads" == "${prefix}.fastq.gz") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + porechop_abi \\ + --input $reads \\ + --threads $task.cpus \\ + $args \\ + --output ${prefix}.fastq.gz \\ + | tee ${prefix}.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop_abi: \$( porechop_abi --version ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}.porechop_abi" + """ + echo "" | gzip > ${prefix}.fastq.gz + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop_abi: \$( porechop_abi --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/porechop/abi/meta.yml b/modules/nf-core/porechop/abi/meta.yml new file mode 100644 index 00000000..a856ffbe --- /dev/null +++ b/modules/nf-core/porechop/abi/meta.yml @@ -0,0 +1,48 @@ +name: "porechop_abi" +description: Extension of Porechop whose purpose is to process adapter sequences in ONT reads. +keywords: + - porechop_abi + - adapter + - nanopore +tools: + - "porechop_abi": + description: Extension of Porechop whose purpose is to process adapter sequences in ONT reads. + homepage: "https://github.com/bonsai-team/Porechop_ABI" + documentation: "https://github.com/bonsai-team/Porechop_ABI" + tool_dev_url: "https://github.com/bonsai-team/Porechop_ABI" + doi: "10.1101/2022.07.07.499093" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: fastq/fastq.gz file + pattern: "*.{fastq,fastq.gz,fq,fq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Adapter-trimmed fastq.gz file + pattern: "*.fastq.gz" + - log: + type: file + description: Log file containing stdout information + pattern: "*.log" +authors: + - "@sofstam" + - "LilyAnderssonLee" +maintainers: + - "@sofstam" + - "LilyAnderssonLee" diff --git a/modules/nf-core/porechop/abi/tests/main.nf.test b/modules/nf-core/porechop/abi/tests/main.nf.test new file mode 100644 index 00000000..b5a29f90 --- /dev/null +++ b/modules/nf-core/porechop/abi/tests/main.nf.test @@ -0,0 +1,59 @@ +nextflow_process { + + name "Test Process PORECHOP_ABI" + script "../main.nf" + process "PORECHOP_ABI" + tag "modules" + tag "modules_nfcore" + tag "porechop" + tag "porechop/abi" + + test("sarscov2-nanopore") { + + when { + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.reads, + file(process.out.log.get(0).get(1)).readLines()[20..40], + process.out.versions).match() + } + ) + } + } + + test("sarscov2-nanopore - stub") { + + options "-stub" + + when { + + process { + """ + input[0] = [ + [ id:'test'], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/porechop/abi/tests/main.nf.test.snap b/modules/nf-core/porechop/abi/tests/main.nf.test.snap new file mode 100644 index 00000000..ad63f4ed --- /dev/null +++ b/modules/nf-core/porechop/abi/tests/main.nf.test.snap @@ -0,0 +1,94 @@ +{ + "sarscov2-nanopore": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.porechop_abi.fastq.gz:md5,886fdb859fb50e0dddd35007bcff043e" + ] + ], + [ + " Best \u001b[0m", + " read Best \u001b[0m", + " start read end\u001b[0m", + " \u001b[4mSet %ID %ID \u001b[0m", + " \u001b[32mSQK-NSK007 100.0 73.1\u001b[0m", + " Rapid 40.4 0.0", + " RBK004_upstream 77.5 0.0", + " SQK-MAP006 75.8 72.7", + " SQK-MAP006 short 65.5 66.7", + " PCR adapters 1 73.9 69.6", + " PCR adapters 2 80.0 72.7", + " PCR adapters 3 70.8 69.6", + " 1D^2 part 1 71.4 70.0", + " 1D^2 part 2 84.8 75.8", + " cDNA SSP 63.0 61.7", + " \u001b[32mBarcode 1 (reverse) 100.0 100.0\u001b[0m", + " Barcode 2 (reverse) 70.8 69.2", + " Barcode 3 (reverse) 76.0 70.4", + " Barcode 4 (reverse) 74.1 71.4", + " Barcode 5 (reverse) 77.8 80.8", + " Barcode 6 (reverse) 73.1 70.8" + ], + [ + "versions.yml:md5,0e9e5e0d35a68ff8e6490c949b257f98" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.1" + }, + "timestamp": "2024-07-29T13:50:49.318599" + }, + "sarscov2-nanopore - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.porechop_abi.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.porechop_abi.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,0e9e5e0d35a68ff8e6490c949b257f98" + ], + "log": [ + [ + { + "id": "test" + }, + "test.porechop_abi.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "reads": [ + [ + { + "id": "test" + }, + "test.porechop_abi.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + 
"versions.yml:md5,0e9e5e0d35a68ff8e6490c949b257f98" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.1" + }, + "timestamp": "2024-07-29T13:50:54.425389" + } +} \ No newline at end of file diff --git a/modules/nf-core/porechop/porechop/environment.yml b/modules/nf-core/porechop/porechop/environment.yml new file mode 100644 index 00000000..28b67c16 --- /dev/null +++ b/modules/nf-core/porechop/porechop/environment.yml @@ -0,0 +1,7 @@ +name: porechop_porechop +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::porechop=0.2.4 diff --git a/modules/nf-core/porechop/porechop/main.nf b/modules/nf-core/porechop/porechop/main.nf new file mode 100644 index 00000000..1ff02a12 --- /dev/null +++ b/modules/nf-core/porechop/porechop/main.nf @@ -0,0 +1,48 @@ +process PORECHOP_PORECHOP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/porechop:0.2.4--py39h7cff6ad_2' : + 'biocontainers/porechop:0.2.4--py39h7cff6ad_2' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + porechop \\ + -i $reads \\ + -t $task.cpus \\ + $args \\ + -o ${prefix}.fastq.gz \\ + > ${prefix}.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop: \$( porechop --version ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.fastq + gzip ${prefix}.fastq + touch ${prefix}.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop: \$( porechop --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/porechop/porechop/meta.yml b/modules/nf-core/porechop/porechop/meta.yml new file mode 100644 index 00000000..13be76f2 --- /dev/null +++ b/modules/nf-core/porechop/porechop/meta.yml @@ -0,0 +1,62 @@ +name: "porechop_porechop" +description: Adapter removal and demultiplexing of Oxford Nanopore reads +keywords: + - adapter + - nanopore + - demultiplexing +tools: + - porechop: + description: Adapter removal and demultiplexing of Oxford Nanopore reads + homepage: "https://github.com/rrwick/Porechop" + documentation: "https://github.com/rrwick/Porechop" + tool_dev_url: "https://github.com/rrwick/Porechop" + doi: "10.1099/mgen.0.000132" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: fastq/fastq.gz file + pattern: "*.{fastq,fastq.gz,fq,fq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Demultiplexed and/or adapter-trimmed fastq.gz file + pattern: "*.{fastq.gz}" + - log: + type: file + description: Log file containing stdout information + pattern: "*.log" +authors: + - "@ggabernet" + - "@jasmezz" + - "@d4straub" + - "@LaurenceKuhl" + - "@SusiJo" + - "@jonasscheid" + - "@jonoave" + - "@GokceOGUZ" + - "@jfy133" +maintainers: + - "@ggabernet" + - "@jasmezz" + - "@d4straub" + - "@LaurenceKuhl" + - "@SusiJo" + - "@jonasscheid" + - "@jonoave" + - "@GokceOGUZ" + - "@jfy133" diff --git a/modules/nf-core/porechop/porechop/tests/main.nf.test b/modules/nf-core/porechop/porechop/tests/main.nf.test new file mode 100644 index 00000000..4c3c3d65 --- /dev/null +++ b/modules/nf-core/porechop/porechop/tests/main.nf.test @@ -0,0 +1,61 @@ +nextflow_process { + + name "Test Process PORECHOP_PORECHOP" + script "../main.nf" + process "PORECHOP_PORECHOP" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "porechop" + tag "porechop/porechop" + + test("sarscov2 - nanopore - fastq") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], + file(params.test_data['sarscov2']['nanopore']['test_fastq_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match("reads") }, + { assert snapshot(process.out.versions).match("versions") }, + // complete log is not stable. These first lines should be stable + { assert snapshot(path(process.out.log.get(0).get(1)).readLines()[0..7]).match("log")} + ) + } + + } + + + test("stub") { + options "-stub" + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], + [] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + +} diff --git a/modules/nf-core/porechop/porechop/tests/main.nf.test.snap b/modules/nf-core/porechop/porechop/tests/main.nf.test.snap new file mode 100644 index 00000000..cf544d2d --- /dev/null +++ b/modules/nf-core/porechop/porechop/tests/main.nf.test.snap @@ -0,0 +1,88 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,712c0753b56d0fb530092dfb5bdf2e5c" + ] + ], + "timestamp": "2023-12-18T07:47:16.83444" + }, + "log": { + "content": [ + [ + "", + "\u001b[1m\u001b[4mLoading reads\u001b[0m", + "test.fastq.gz", + "100 reads loaded", + "", + "", + "\u001b[1m\u001b[4mLooking for known adapter sets\u001b[0m", + "" + ] + ], + "timestamp": "2023-12-18T07:47:16.853899" + }, + "reads": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test_porechop.fastq.gz:md5,886fdb859fb50e0dddd35007bcff043e" + ] + ] + ], + "timestamp": "2023-12-18T07:47:16.811393" + }, + "stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_porechop.fastq.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test_porechop.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,712c0753b56d0fb530092dfb5bdf2e5c" + ], + "log": [ + [ + { + "id": "test", + "single_end": true + }, + "test_porechop.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_porechop.fastq.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,712c0753b56d0fb530092dfb5bdf2e5c" 
+ ] + } + ], + "timestamp": "2023-12-18T07:47:37.814949" + } +} \ No newline at end of file diff --git a/modules/nf-core/porechop/porechop/tests/nextflow.config b/modules/nf-core/porechop/porechop/tests/nextflow.config new file mode 100644 index 00000000..a9ecf7b6 --- /dev/null +++ b/modules/nf-core/porechop/porechop/tests/nextflow.config @@ -0,0 +1,9 @@ +process { + + + withName: PORECHOP_PORECHOP { + ext.args = '' + ext.prefix = { "${meta.id}_porechop" } + } + +} diff --git a/modules/nf-core/porechop/porechop/tests/tags.yml b/modules/nf-core/porechop/porechop/tests/tags.yml new file mode 100644 index 00000000..743645c2 --- /dev/null +++ b/modules/nf-core/porechop/porechop/tests/tags.yml @@ -0,0 +1,2 @@ +porechop/porechop: + - "modules/nf-core/porechop/porechop/**" diff --git a/modules/nf-core/prodigal/environment.yml b/modules/nf-core/prodigal/environment.yml new file mode 100644 index 00000000..7609bf3b --- /dev/null +++ b/modules/nf-core/prodigal/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::prodigal=2.6.3 + - conda-forge::pigz=2.6 diff --git a/modules/nf-core/prodigal/main.nf b/modules/nf-core/prodigal/main.nf new file mode 100644 index 00000000..49ced167 --- /dev/null +++ b/modules/nf-core/prodigal/main.nf @@ -0,0 +1,64 @@ +process PRODIGAL { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' : + 'biocontainers/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' }" + + input: + tuple val(meta), path(genome) + val(output_format) + + output: + tuple val(meta), path("${prefix}.${output_format}.gz"), emit: gene_annotations + tuple val(meta), path("${prefix}.fna.gz"), emit: nucleotide_fasta + tuple val(meta), path("${prefix}.faa.gz"), emit: amino_acid_fasta + tuple val(meta), path("${prefix}_all.txt.gz"), emit: all_gene_annotations + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + pigz -cdf ${genome} | prodigal \\ + $args \\ + -f $output_format \\ + -d "${prefix}.fna" \\ + -o "${prefix}.${output_format}" \\ + -a "${prefix}.faa" \\ + -s "${prefix}_all.txt" + + pigz -nm ${prefix}.fna + pigz -nm ${prefix}.${output_format} + pigz -nm ${prefix}.faa + pigz -nm ${prefix}_all.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prodigal: \$(prodigal -v 2>&1 | sed -n 's/Prodigal V\\(.*\\):.*/\\1/p') + pigz: \$(pigz -V 2>&1 | sed 's/pigz //g') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.fna.gz + touch ${prefix}.${output_format}.gz + touch ${prefix}.faa.gz + touch ${prefix}_all.txt.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prodigal: \$(prodigal -v 2>&1 | sed -n 's/Prodigal V\\(.*\\):.*/\\1/p') + pigz: \$(pigz -V 2>&1 | sed 's/pigz //g') + END_VERSIONS + """ + +} diff --git a/modules/nf-core/prodigal/meta.yml b/modules/nf-core/prodigal/meta.yml new file mode 100644 index 00000000..7d3d459e --- /dev/null +++ b/modules/nf-core/prodigal/meta.yml @@ -0,0 +1,79 @@ +name: prodigal +description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a + microbial 
(bacterial and archaeal) gene finding program +keywords: + - prokaryotes + - gene finding + - microbial +tools: + - prodigal: + description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) + is a microbial (bacterial and archaeal) gene finding program + homepage: https://github.com/hyattpd/Prodigal + documentation: https://github.com/hyattpd/prodigal/wiki + tool_dev_url: https://github.com/hyattpd/Prodigal + doi: "10.1186/1471-2105-11-119" + licence: ["GPL v3"] + identifier: biotools:prodigal +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - genome: + type: file + description: fasta/fasta.gz file + - - output_format: + type: string + description: Output format ("gbk"/"gff"/"sqn"/"sco") +output: + - gene_annotations: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.${output_format}.gz: + type: file + description: gene annotations in output_format given as input + pattern: "*.{output_format}" + - nucleotide_fasta: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.fna.gz: + type: file + description: nucleotide sequences file + pattern: "*.{fna}" + - amino_acid_fasta: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.faa.gz: + type: file + description: protein translations file + pattern: "*.{faa}" + - all_gene_annotations: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}_all.txt.gz: + type: file + description: complete starts file + pattern: "*.{_all.txt}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@grst" +maintainers: + - "@grst" diff --git a/modules/nf-core/prodigal/tests/main.nf.test b/modules/nf-core/prodigal/tests/main.nf.test new file mode 100644 index 00000000..446bd0d1 --- /dev/null +++ b/modules/nf-core/prodigal/tests/main.nf.test @@ -0,0 +1,101 @@ +nextflow_process { + + name "Test Process PRODIGAL" + script "../main.nf" + process "PRODIGAL" + + tag "modules" + tag "modules_nfcore" + tag "prodigal" + + test("prodigal - sarscov2 - gff") { + when { + process { + """ + input[0] = [ + [id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = 'gff' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("prodigal - sarscov2 - gbk") { + when { + process { + """ + input[0] = [ + [id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = 'gbk' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("prodigal - sarscov2 - gff - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = 'gff' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.out).match() } + ) + } + } + + 
test("prodigal - sarscov2 - gbk - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + input[1] = 'gbk' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.out).match() } + ) + } + } + +} \ No newline at end of file diff --git a/modules/nf-core/prodigal/tests/main.nf.test.snap b/modules/nf-core/prodigal/tests/main.nf.test.snap new file mode 100644 index 00000000..f29802b4 --- /dev/null +++ b/modules/nf-core/prodigal/tests/main.nf.test.snap @@ -0,0 +1,196 @@ +{ + "prodigal - sarscov2 - gbk - stub": { + "content": null, + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T13:58:09.852618454" + }, + "prodigal - sarscov2 - gff": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gff.gz:md5,612c2724c2891c63350f171f74165757" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fna.gz:md5,1bc8a05bcb72a3c324f5e4ffaa716d3b" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.faa.gz:md5,7168b854103f3586ccfdb71a44c389f7" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test_all.txt.gz:md5,e6d6c50f0c39e5169f84ae3c90837fa9" + ] + ], + "4": [ + "versions.yml:md5,9541e53a6927e9856036bb97bfb30307" + ], + "all_gene_annotations": [ + [ + { + "id": "test", + "single_end": false + }, + "test_all.txt.gz:md5,e6d6c50f0c39e5169f84ae3c90837fa9" + ] + ], + "amino_acid_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.faa.gz:md5,7168b854103f3586ccfdb71a44c389f7" + ] + ], + "gene_annotations": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gff.gz:md5,612c2724c2891c63350f171f74165757" + ] + ], + "nucleotide_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fna.gz:md5,1bc8a05bcb72a3c324f5e4ffaa716d3b" + ] + ], + "versions": [ + "versions.yml:md5,9541e53a6927e9856036bb97bfb30307" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T13:57:49.57989696" + }, + "prodigal - sarscov2 - gff - stub": { + "content": null, + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T13:58:03.210222528" + }, + "prodigal - sarscov2 - gbk": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gbk.gz:md5,188b3a0e3f78740ded7f3ec4d876cb4b" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.fna.gz:md5,1bc8a05bcb72a3c324f5e4ffaa716d3b" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.faa.gz:md5,7168b854103f3586ccfdb71a44c389f7" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test_all.txt.gz:md5,e6d6c50f0c39e5169f84ae3c90837fa9" + ] + ], + "4": [ + "versions.yml:md5,9541e53a6927e9856036bb97bfb30307" + ], + "all_gene_annotations": [ + [ + { + "id": "test", + "single_end": false + }, + "test_all.txt.gz:md5,e6d6c50f0c39e5169f84ae3c90837fa9" + ] + ], + "amino_acid_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.faa.gz:md5,7168b854103f3586ccfdb71a44c389f7" + ] + ], + "gene_annotations": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gbk.gz:md5,188b3a0e3f78740ded7f3ec4d876cb4b" + ] + ], + "nucleotide_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.fna.gz:md5,1bc8a05bcb72a3c324f5e4ffaa716d3b" + ] + ], + "versions": [ + "versions.yml:md5,9541e53a6927e9856036bb97bfb30307" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-18T13:57:56.606374214" + } +} \ No newline at end of file diff --git a/modules/nf-core/prodigal/tests/tags.yml b/modules/nf-core/prodigal/tests/tags.yml new file mode 100644 index 00000000..fc0cb020 --- /dev/null +++ b/modules/nf-core/prodigal/tests/tags.yml @@ -0,0 +1,2 @@ +prodigal: + - "modules/nf-core/prodigal/**" diff --git a/modules/nf-core/prokka/main.nf b/modules/nf-core/prokka/main.nf new file mode 100644 index 00000000..60fbe232 --- /dev/null +++ b/modules/nf-core/prokka/main.nf @@ -0,0 +1,52 @@ +process PROKKA { + tag "$meta.id" + label 'process_low' + + conda "bioconda::prokka=1.14.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/prokka%3A1.14.6--pl5321hdfd78af_4' : + 'biocontainers/prokka:1.14.6--pl5321hdfd78af_4' }" + + input: + tuple val(meta), path(fasta) + path proteins + path prodigal_tf + + output: + tuple val(meta), path("${prefix}/*.gff"), emit: gff + tuple val(meta), path("${prefix}/*.gbk"), emit: gbk + tuple val(meta), path("${prefix}/*.fna"), emit: fna + tuple val(meta), path("${prefix}/*.faa"), emit: faa + tuple val(meta), path("${prefix}/*.ffn"), emit: ffn + tuple val(meta), path("${prefix}/*.sqn"), emit: sqn + tuple val(meta), path("${prefix}/*.fsa"), emit: fsa + tuple val(meta), path("${prefix}/*.tbl"), emit: tbl + tuple val(meta), path("${prefix}/*.err"), emit: err + tuple val(meta), path("${prefix}/*.log"), emit: log + tuple val(meta), path("${prefix}/*.txt"), emit: txt + tuple val(meta), path("${prefix}/*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def proteins_opt = proteins ? "--proteins ${proteins[0]}" : "" + def prodigal_tf = prodigal_tf ? "--prodigaltf ${prodigal_tf[0]}" : "" + """ + prokka \\ + $args \\ + --cpus $task.cpus \\ + --prefix $prefix \\ + $proteins_opt \\ + $prodigal_tf \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prokka: \$(echo \$(prokka --version 2>&1) | sed 's/^.*prokka //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/prokka/meta.yml b/modules/nf-core/prokka/meta.yml new file mode 100644 index 00000000..7fc9e185 --- /dev/null +++ b/modules/nf-core/prokka/meta.yml @@ -0,0 +1,91 @@ +name: prokka +description: Whole genome annotation of small genomes (bacterial, archeal, viral) +keywords: + - annotation + - fasta + - prokka +tools: + - prokka: + description: Rapid annotation of prokaryotic genomes + homepage: https://github.com/tseemann/prokka + doi: "10.1093/bioinformatics/btu153" + licence: ["GPL v2"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: | + FASTA file to be annotated. Has to contain at least a non-empty string dummy value. + - proteins: + type: file + description: FASTA file of trusted proteins to first annotate from (optional) + - prodigal_tf: + type: file + description: Training file to use for Prodigal (optional) + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - gff: + type: file + description: annotation in GFF3 format, containing both sequences and annotations + pattern: "*.{gff}" + - gbk: + type: file + description: annotation in GenBank format, containing both sequences and annotations + pattern: "*.{gbk}" + - fna: + type: file + description: nucleotide FASTA file of the input contig sequences + pattern: "*.{fna}" + - faa: + type: file + description: protein FASTA file of the translated CDS sequences + pattern: "*.{faa}" + - ffn: + type: file + description: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA) + pattern: "*.{ffn}" + - sqn: + type: file + description: an ASN1 format "Sequin" file for submission to Genbank + pattern: "*.{sqn}" + - fsa: + type: file + description: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file + pattern: "*.{fsa}" + - tbl: + type: file + description: feature Table file, used by "tbl2asn" to create the .sqn file + pattern: "*.{tbl}" + - err: + type: file + description: unacceptable annotations - the NCBI discrepancy report. + pattern: "*.{err}" + - log: + type: file + description: contains all the output that Prokka produced during its run + pattern: "*.{log}" + - txt: + type: file + description: statistics relating to the annotated features found + pattern: "*.{txt}" + - tsv: + type: file + description: tab-separated file of all features (locus_tag,ftype,len_bp,gene,EC_number,COG,product) + pattern: "*.{tsv}" + +authors: + - "@rpetit3" diff --git a/modules/nf-core/pydamage/analyze/main.nf b/modules/nf-core/pydamage/analyze/main.nf new file mode 100644 index 00000000..03cbe62a --- /dev/null +++ b/modules/nf-core/pydamage/analyze/main.nf @@ -0,0 +1,35 @@ +process PYDAMAGE_ANALYZE { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::pydamage=0.70" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pydamage:0.70--pyhdfd78af_0' : + 'biocontainers/pydamage:0.70--pyhdfd78af_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("pydamage_results/pydamage_results.csv"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + pydamage \\ + analyze \\ + $args \\ + -p $task.cpus \\ + $bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pydamage: \$(echo \$(pydamage --version 2>&1) | sed -e 's/pydamage, version //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/pydamage/analyze/meta.yml b/modules/nf-core/pydamage/analyze/meta.yml new file mode 100644 index 00000000..09dd25eb --- /dev/null +++ b/modules/nf-core/pydamage/analyze/meta.yml @@ -0,0 +1,55 @@ +name: pydamage_analyze +description: Damage parameter estimation for ancient DNA +keywords: + - ancient DNA + - aDNA + - de novo assembly + - filtering + - damage + - deamination + - miscoding lesions + - C to T + - palaeogenomics + - archaeogenomics + - palaeogenetics + - archaeogenetics +tools: + - pydamage: + description: Damage parameter estimation for ancient DNA + homepage: https://github.com/maxibor/pydamage + documentation: https://pydamage.readthedocs.io/ + tool_dev_url: https://github.com/maxibor/pydamage + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csv: + type: file + description: PyDamage results as csv files + pattern: "*.csv" + +authors: + - "@maxibor" diff --git a/modules/nf-core/pydamage/filter/main.nf b/modules/nf-core/pydamage/filter/main.nf new file mode 100644 index 00000000..59d6e4b9 --- /dev/null +++ b/modules/nf-core/pydamage/filter/main.nf @@ -0,0 +1,35 @@ +process PYDAMAGE_FILTER { + tag "$meta.id" + label 'process_single' + + conda "bioconda::pydamage=0.70" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pydamage:0.70--pyhdfd78af_0' : + 'biocontainers/pydamage:0.70--pyhdfd78af_0' }" + + input: + tuple val(meta), path(csv) + + output: + tuple val(meta), path("pydamage_results/pydamage_filtered_results.csv"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + + pydamage \\ + filter \\ + $args \\ + $csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pydamage: \$(echo \$(pydamage --version 2>&1) | sed -e 's/pydamage, version //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/pydamage/filter/meta.yml b/modules/nf-core/pydamage/filter/meta.yml new file mode 100644 index 00000000..c732ab9b --- /dev/null +++ b/modules/nf-core/pydamage/filter/meta.yml @@ -0,0 +1,51 @@ +name: pydamage_filter +description: Damage parameter estimation for ancient DNA +keywords: + - ancient DNA + - aDNA + - de novo assembly + - filtering + - damage + - deamination + - miscoding lesions + - C to T + - palaeogenomics + - archaeogenomics + - palaeogenetics + - archaeogenetics +tools: + - pydamage: + description: Damage parameter estimation for ancient DNA + homepage: https://github.com/maxibor/pydamage + documentation: https://pydamage.readthedocs.io/ + tool_dev_url: https://github.com/maxibor/pydamage + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - csv: + type: file + description: csv file from pydamage analyze + pattern: "*.csv" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csv: + type: file + description: PyDamage filtered results as csv file + pattern: "*.csv" + +authors: + - "@maxibor" diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf new file mode 100644 index 00000000..59ed3088 --- /dev/null +++ b/modules/nf-core/samtools/faidx/main.nf @@ -0,0 +1,50 @@ +process SAMTOOLS_FAIDX { + tag "$fasta" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(fai) + + output: + tuple val(meta), path ("*.{fa,fasta}") , emit: fa , optional: true + tuple val(meta), path ("*.fai") , emit: fai, optional: true + tuple val(meta), path ("*.gzi") , emit: gzi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + faidx \\ + $fasta \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? 
"touch ${match[0][1]}" : '' + """ + ${fastacmd} + touch ${fasta}.fai + + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml new file mode 100644 index 00000000..957b25e5 --- /dev/null +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -0,0 +1,57 @@ +name: samtools_faidx +description: Index FASTA file +keywords: + - index + - fasta + - faidx +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/seqtk/mergepe/main.nf b/modules/nf-core/seqtk/mergepe/main.nf new file mode 100644 index 00000000..6a4362e5 --- /dev/null +++ b/modules/nf-core/seqtk/mergepe/main.nf @@ -0,0 +1,46 @@ +process SEQTK_MERGEPE { + tag "$meta.id" + label 'process_single' + + conda "bioconda::seqtk=1.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqtk:1.3--h5bf99c6_3' : + 'biocontainers/seqtk:1.3--h5bf99c6_3' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + ln -s ${reads} ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + seqtk \\ + mergepe \\ + $args \\ + ${reads} \\ + | gzip -n >> ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/seqtk/mergepe/meta.yml b/modules/nf-core/seqtk/mergepe/meta.yml new file mode 100644 index 00000000..8248ee09 --- /dev/null +++ b/modules/nf-core/seqtk/mergepe/meta.yml @@ -0,0 +1,40 @@ +name: seqtk_mergepe +description: Interleave pair-end reads from FastQ files +keywords: + - interleave +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. 
Seqtk mergepe command merges pair-end reads into one interleaved file. + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input FastQ files of size 1 and 2 for single-end and paired-end data,respectively. + pattern: "*.{fastq.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: If single-end reads, the output is the same as the input, 1 FastQ file for each read. If pair-end reads, the read pairs will be interleaved and output as 1 FastQ file for each read pair. + pattern: "*.{fastq.gz}" + +authors: + - "@emnilsson" diff --git a/modules/nf-core/spades/environment.yml b/modules/nf-core/spades/environment.yml new file mode 100644 index 00000000..8cc5321f --- /dev/null +++ b/modules/nf-core/spades/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::spades=4.0.0 diff --git a/modules/nf-core/spades/main.nf b/modules/nf-core/spades/main.nf new file mode 100644 index 00000000..36cdfe44 --- /dev/null +++ b/modules/nf-core/spades/main.nf @@ -0,0 +1,102 @@ +process SPADES { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/spades:4.0.0--h5fb382e_1' : + 'biocontainers/spades:4.0.0--h5fb382e_1' }" + + input: + tuple val(meta), path(illumina), path(pacbio), path(nanopore) + path yml + path hmm + + output: + tuple val(meta), path('*.scaffolds.fa.gz') , optional:true, emit: scaffolds + tuple val(meta), path('*.contigs.fa.gz') , optional:true, emit: contigs + tuple val(meta), path('*.transcripts.fa.gz') , optional:true, emit: transcripts + tuple val(meta), path('*.gene_clusters.fa.gz'), optional:true, emit: gene_clusters + tuple val(meta), path('*.assembly.gfa.gz') , optional:true, emit: gfa + tuple val(meta), path('*.warnings.log') , optional:true, emit: warnings + tuple val(meta), path('*.spades.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def maxmem = task.memory.toGiga() + def illumina_reads = illumina ? ( meta.single_end ? "-s $illumina" : "-1 ${illumina[0]} -2 ${illumina[1]}" ) : "" + def pacbio_reads = pacbio ? "--pacbio $pacbio" : "" + def nanopore_reads = nanopore ? "--nanopore $nanopore" : "" + def custom_hmms = hmm ? "--custom-hmms $hmm" : "" + def reads = yml ? 
"--dataset $yml" : "$illumina_reads $pacbio_reads $nanopore_reads" + """ + spades.py \\ + $args \\ + --threads $task.cpus \\ + --memory $maxmem \\ + $custom_hmms \\ + $reads \\ + -o ./ + mv spades.log ${prefix}.spades.log + + if [ -f scaffolds.fasta ]; then + mv scaffolds.fasta ${prefix}.scaffolds.fa + gzip -n ${prefix}.scaffolds.fa + fi + if [ -f contigs.fasta ]; then + mv contigs.fasta ${prefix}.contigs.fa + gzip -n ${prefix}.contigs.fa + fi + if [ -f transcripts.fasta ]; then + mv transcripts.fasta ${prefix}.transcripts.fa + gzip -n ${prefix}.transcripts.fa + fi + if [ -f assembly_graph_with_scaffolds.gfa ]; then + mv assembly_graph_with_scaffolds.gfa ${prefix}.assembly.gfa + gzip -n ${prefix}.assembly.gfa + fi + + if [ -f gene_clusters.fasta ]; then + mv gene_clusters.fasta ${prefix}.gene_clusters.fa + gzip -n ${prefix}.gene_clusters.fa + fi + + if [ -f warnings.log ]; then + mv warnings.log ${prefix}.warnings.log + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + spades: \$(spades.py --version 2>&1 | sed -n 's/^.*SPAdes genome assembler v//p') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def maxmem = task.memory.toGiga() + def illumina_reads = illumina ? ( meta.single_end ? "-s $illumina" : "-1 ${illumina[0]} -2 ${illumina[1]}" ) : "" + def pacbio_reads = pacbio ? "--pacbio $pacbio" : "" + def nanopore_reads = nanopore ? "--nanopore $nanopore" : "" + def custom_hmms = hmm ? "--custom-hmms $hmm" : "" + def reads = yml ? "--dataset $yml" : "$illumina_reads $pacbio_reads $nanopore_reads" + """ + echo "" | gzip > ${prefix}.scaffolds.fa.gz + echo "" | gzip > ${prefix}.contigs.fa.gz + echo "" | gzip > ${prefix}.transcripts.fa.gz + echo "" | gzip > ${prefix}.gene_clusters.fa.gz + echo "" | gzip > ${prefix}.assembly.gfa.gz + touch ${prefix}.spades.log + touch ${prefix}.warnings.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + spades: \$(spades.py --version 2>&1 | sed -n 's/^.*SPAdes genome assembler v//p') + END_VERSIONS + """ +} diff --git a/modules/nf-core/spades/meta.yml b/modules/nf-core/spades/meta.yml new file mode 100644 index 00000000..986871be --- /dev/null +++ b/modules/nf-core/spades/meta.yml @@ -0,0 +1,99 @@ +name: spades +description: Assembles a small genome (bacterial, fungal, viral) +keywords: + - genome + - assembly + - genome assembler + - small genome + - de novo assembler +tools: + - spades: + description: SPAdes (St. Petersburg genome assembler) is intended for both standard isolates and single-cell MDA bacteria assemblies. + homepage: http://cab.spbu.ru/files/release3.15.0/manual.html + documentation: http://cab.spbu.ru/files/release3.15.0/manual.html + tool_dev_url: https://github.com/ablab/spades + doi: 10.1089/cmb.2012.0021 + licence: ["GPL v2"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - illumina: + type: file + description: | + List of input FastQ (Illumina or PacBio CCS reads) files + of size 1 and 2 for single-end and paired-end data, + respectively. This input data type is required. + - pacbio: + type: file + description: | + List of input PacBio CLR FastQ files of size 1. + - nanopore: + type: file + description: | + List of input FastQ files of size 1, originating from Oxford Nanopore technology. + - yml: + type: file + description: | + Path to yml file containing read information. 
+ The raw FASTQ files listed in this YAML file MUST be supplied to the respective illumina/pacbio/nanopore input channel(s) _in addition_ to this YML. + File entries in this yml must contain only the file name and no paths. + pattern: "*.{yml,yaml}" + - hmm: + type: file + description: File or directory with amino acid HMMs for Spades HMM-guided mode. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - scaffolds: + type: file + description: | + Fasta file containing scaffolds + pattern: "*.fa.gz" + - contigs: + type: file + description: | + Fasta file containing contigs + pattern: "*.fa.gz" + - transcripts: + type: file + description: | + Fasta file containing transcripts + pattern: "*.fa.gz" + - gene_clusters: + type: file + description: | + Fasta file containing gene_clusters + pattern: "*.fa.gz" + - gfa: + type: file + description: | + gfa file containing assembly + pattern: "*.gfa.gz" + - log: + type: file + description: | + Spades log file + pattern: "*.spades.log" + - log: + type: file + description: | + Spades warning log file + pattern: "*.warning.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@JoseEspinosa" + - "@drpatelh" + - "@d4straub" +maintainers: + - "@JoseEspinosa" + - "@drpatelh" + - "@d4straub" diff --git a/modules/nf-core/spades/tests/main.nf.test b/modules/nf-core/spades/tests/main.nf.test new file mode 100644 index 00000000..3a93f486 --- /dev/null +++ b/modules/nf-core/spades/tests/main.nf.test @@ -0,0 +1,228 @@ +nextflow_process { + + name "Test Process SPADES" + script "../main.nf" + process "SPADES" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "spades" + + test("sarscov2 - se ") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [] + ] + input[1] = [] + input[2] = [] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } } + ) + } + } + + test("sarscov2 - pe ") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [] + ] + input [1] = [] + input [2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } }, + { assert file(process.out.warnings[0][1]).find{ file(it).name == "warnings.log"} } + ) + } + + } + // isnt perfect, because CCS reads should rather be used with -s instead of --pacbio + test("sarscov2 - pe - pacbio ") { + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 
"genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [ file(params.modules_testdata_base_path + "genomics/sarscov2/nanopore/fastq/test.fastq.gz", checkIfExists: true) ] + ] + input [1] = [] + input [2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } }, + { assert file(process.out.warnings[0][1]).find{ file(it).name == "warnings.log"} } + ) + } + } + + test("sarscov2 - pe - nanopore ") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [ file(params.modules_testdata_base_path + "genomics/sarscov2/nanopore/fastq/test.fastq.gz", checkIfExists: true) ] + ] + input [1] = [] + input [2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } }, + { assert file(process.out.warnings[0][1]).find{ file(it).name == "warnings.log"} } + ) + } + } + + test("sarscov2 - pe - nanopore - yml ") { + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [ file(params.modules_testdata_base_path + "genomics/sarscov2/nanopore/fastq/test.fastq.gz", checkIfExists: true) ] + ] + input [1] = file(params.modules_testdata_base_path + "delete_me/spades/spades_input_yml.yml", checkIfExists: true) + input [2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } }, + { assert file(process.out.warnings[0][1]).find{ file(it).name == "warnings.log"} } + ) + } + } + + test("sarscov2 - pe - hmm ") { + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file("https://github.com/nf-core/test-datasets/raw/viralrecon/illumina/sispa/SRR11140744_R1.fastq.gz", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/viralrecon/illumina/sispa/SRR11140744_R2.fastq.gz", checkIfExists: true) ], + [], + [] + ] + input [1] = [] + input [2] = [file(params.modules_testdata_base_path + "/genomics/sarscov2/genome/proteome.hmm.gz", checkIfExists: true)] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + 
process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } } + ) + } + } + + test("sarscov2 - pe - stub ") { + options "-stub" + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [] + ] + input [1] = [] + input [2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + +} diff --git a/modules/nf-core/spades/tests/main.nf.test.snap b/modules/nf-core/spades/tests/main.nf.test.snap new file mode 100644 index 00000000..e1b3b652 --- /dev/null +++ b/modules/nf-core/spades/tests/main.nf.test.snap @@ -0,0 +1,403 @@ +{ + "sarscov2 - pe - nanopore ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,19418df83534fc93543dec4ec9b2ae72" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:13:08.663068339" + }, + "sarscov2 - pe - hmm ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,ce077d5f3380690f8d9a5fe188f82128" + ] + ], + [ + + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,07136eab8e231f095dc5dd62f1b62a91" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T08:04:19.650636803" + }, + "sarscov2 - pe - pacbio ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,19418df83534fc93543dec4ec9b2ae72" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:12:49.305512756" + }, + "sarscov2 - pe ": { + "content": [ + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,70e4a5485dd59566b212a199c31c343b" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,b773132d52be5090cdbdf5a643027093" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:12:36.161628498" + }, + "sarscov2 - pe - nanopore - yml ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.contigs.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,19418df83534fc93543dec4ec9b2ae72" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:13:21.868805946" + }, + "sarscov2 - se ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.scaffolds.fa.gz:md5,65ba6a517c152dbe219bf4b5b92bdad7" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.contigs.fa.gz:md5,65ba6a517c152dbe219bf4b5b92bdad7" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.assembly.gfa.gz:md5,e4836fdf7104d79e314e3e50986b4bb2" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:12:16.562778962" + }, + "sarscov2 - pe - stub ": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.transcripts.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gene_clusters.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false + }, + "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false + }, + "test.spades.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ], + "contigs": [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "gene_clusters": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gene_clusters.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "gfa": [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + "test.spades.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "scaffolds": [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "transcripts": [ + [ + { + "id": "test", + "single_end": false + }, + "test.transcripts.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ], + "warnings": [ + [ + { + "id": "test", + "single_end": false + }, + "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:20:07.195881734" + } +} diff --git a/modules/nf-core/spades/tests/nextflow.config b/modules/nf-core/spades/tests/nextflow.config new file mode 100644 index 00000000..adec1bde --- /dev/null +++ b/modules/nf-core/spades/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: SPADES { + ext.args = '--rnaviral' + } +} diff --git a/modules/nf-core/spades/tests/tags.yml 
b/modules/nf-core/spades/tests/tags.yml new file mode 100644 index 00000000..035861ff --- /dev/null +++ b/modules/nf-core/spades/tests/tags.yml @@ -0,0 +1,2 @@ +spades: + - "modules/nf-core/spades/**" diff --git a/modules/nf-core/tiara/tiara/main.nf b/modules/nf-core/tiara/tiara/main.nf new file mode 100644 index 00000000..ec28032d --- /dev/null +++ b/modules/nf-core/tiara/tiara/main.nf @@ -0,0 +1,63 @@ +process TIARA_TIARA { + tag "$meta.id" + label 'process_medium' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda "conda-forge::tiara=1.0.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/tiara:1.0.3' : + 'biocontainers/tiara:1.0.3' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("${prefix}.{txt,txt.gz}") , emit: classifications + tuple val(meta), path("log_*.{txt,txt.gz}") , emit: log + tuple val(meta), path("*.{fasta,fasta.gz}") , emit: fasta, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.0.3' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + tiara -i ${fasta} \ + -o ${prefix}.txt \ + --threads ${task.cpus} \ + ${args} + + ## fix gzip flag weirdness and ensure consistent .fasta filename output + ## check if fasta files are being output + if echo "${args}" | grep -qE "tf|to-fasta"; then + ## check if we've asked for gzip output, then rename files consistently + if echo "${args}" | grep -q "gz"; then + find . -name "*_${fasta}*" -exec sh -c 'file=`basename {}`; mv "\$file" "\${file%%_*}_${prefix}.fasta.gz"' \\; + else + find . -name "*_${fasta}*" -exec sh -c 'file=`basename {}`; mv "\$file" "\${file%%_*}_${prefix}.fasta"' \\; + fi + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tiara: ${VERSION} + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.0.3' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + touch ${prefix}.out.txt + touch log_${prefix}.out.txt + touch bacteria_${prefix}.fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tiara: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/tiara/tiara/meta.yml b/modules/nf-core/tiara/tiara/meta.yml new file mode 100644 index 00000000..687bb63e --- /dev/null +++ b/modules/nf-core/tiara/tiara/meta.yml @@ -0,0 +1,52 @@ +name: "tiara_tiara" +description: Domain-level classification of contigs to bacterial, archaeal, eukaryotic, or organelle +keywords: + - contigs + - metagenomics + - classify +tools: + - "tiara": + description: "Deep-learning-based approach for identification of eukaryotic sequences in the metagenomic data powered by PyTorch." + homepage: "https://ibe-uw.github.io/tiara/" + documentation: https://ibe-uw.github.io/tiara/" + tool_dev_url: "https://github.com/ibe-uw/tiara" + doi: "10.1093/bioinformatics/btab672" + licence: "MIT" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: FASTA file of assembled contigs. 
+ pattern: "*.{fa,fa.gz,fasta,fasta.gz,fna,fna.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classifications: + type: file + description: TSV file containing per-contig classification probabilities and overall classifications. Gzipped if flag --gz is set. + pattern: "*.{txt,txt.gz}" + - log: + type: file + description: Log file containing tiara model parameters. Gzipped if flag --gz is set. + pattern: "log_*.{txt,txt.gz}" + - fasta: + type: file + description: | + (optional) - fasta files for each domain category specified in command flag `-tf`, containing classified contigs + pattern: "*.{fasta,fasta.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@prototaxites" diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml new file mode 100644 index 00000000..0c9cbb10 --- /dev/null +++ b/modules/nf-core/untar/environment.yml @@ -0,0 +1,11 @@ +name: untar + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - conda-forge::grep=3.11 + - conda-forge::sed=4.7 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 00000000..8a75bb95 --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,63 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 00000000..a9a2110f --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,46 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. 
+ documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test new file mode 100644 index 00000000..2a7c97bf --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test @@ -0,0 +1,47 @@ +nextflow_process { + + name "Test Process UNTAR" + script "../main.nf" + process "UNTAR" + tag "modules" + tag "modules_nfcore" + tag "untar" + test("test_untar") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar") }, + ) + } + + } + + test("test_untar_onlyfiles") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar_onlyfiles") }, + ) + } + + } + +} diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap new file mode 100644 index 00000000..64550292 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -0,0 +1,42 @@ +{ + "test_untar_onlyfiles": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:49:41.320643" + }, + "test_untar": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:49:33.795172" + } +} \ No newline at end of file diff --git a/modules/nf-core/untar/tests/tags.yml b/modules/nf-core/untar/tests/tags.yml new file mode 100644 index 00000000..feb6f15c --- /dev/null +++ b/modules/nf-core/untar/tests/tags.yml @@ -0,0 +1,2 @@ +untar: + - modules/nf-core/untar/** diff --git a/nextflow.config b/nextflow.config index ee8d7289..2e915115 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,12 +9,156 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags // Input options - input = null + input = null + assembly_input = null + single_end = false + + // short read preprocessing options + skip_clipping = false + clip_tool = 'fastp' + save_clipped_reads = false + reads_minlength = 15 + fastp_save_trimmed_fail = false + fastp_qualified_quality = 15 + fastp_cut_mean_quality = 15 + adapterremoval_minquality = 2 + adapterremoval_adapter1 = 
'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG' + adapterremoval_adapter2 = 'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT' + adapterremoval_trim_quality_stretch = false + keep_phix = false + // long read preprocessing options + longread_adaptertrimming_tool = "porechop_abi" + longread_filtering_tool = "filtlong" + // phix_reference = "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/viral/Enterobacteria_phage_phiX174_sensu_lato/all_assembly_versions/GCA_002596845.1_ASM259684v1/GCA_002596845.1_ASM259684v1_genomic.fna.gz" + phix_reference = "${baseDir}/assets/data/GCA_002596845.1_ASM259684v1_genomic.fna.gz" + save_phixremoved_reads = false + host_fasta = null + host_fasta_bowtie2index = null + host_genome = null + host_removal_verysensitive = false + host_removal_save_ids = false + save_hostremoved_reads = false + bbnorm = false + bbnorm_target = 100 + bbnorm_min = 5 + save_bbnorm_reads = false + + // binning options + bowtie2_mode = null + binning_map_mode = 'group' + save_assembly_mapped_reads = false + skip_binning = false + min_contig_size = 1500 + min_length_unbinned_contigs = 1000000 + max_unbinned_contigs = 100 + skip_prokka = false + prokka_with_compliance = false + prokka_compliance_centre = null + + // assembly options + coassemble_group = false + spades_options = null + megahit_options = null + skip_spades = false + skip_spadeshybrid = false + skip_megahit = false + skip_quast = false + skip_prodigal = false + + // virus identification options + run_virus_identification = false + genomad_db = null + genomad_min_score = 0.7 + genomad_splits = 1 + + // ancient DNA assembly validation options + ancient_dna = false + pydamage_accuracy = 0.5 + skip_ancient_damagecorrection = false + freebayes_ploidy = 1 + freebayes_min_basequality = 20 + freebayes_minallelefreq = 0.33 + bcftools_view_high_variant_quality = 30 + bcftools_view_medium_variant_quality = 20 + bcftools_view_minimal_allelesupport = 3 + + // taxonomy options + centrifuge_db = null + kraken2_db = null + skip_krona = false + krona_db = null + cat_db = null + cat_db_generate = false + cat_official_taxonomy = false + save_cat_db = false + skip_gtdbtk = false + gtdb_db = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" + gtdb_mash = null + gtdbtk_min_completeness = 50.0 + gtdbtk_max_contamination = 10.0 + gtdbtk_min_perc_aa = 10 + gtdbtk_min_af = 0.65 + gtdbtk_pplacer_cpus = 1 + gtdbtk_pplacer_useram = false + + // long read preprocessing options + skip_adapter_trimming = false + keep_lambda = false + longreads_min_quality = null + longreads_min_length = 1000 + longreads_keep_percent = 90 + longreads_length_weight = 10 + // lambda_reference = "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/viral/Escherichia_virus_Lambda/all_assembly_versions/GCA_000840245.1_ViralProj14204/GCA_000840245.1_ViralProj14204_genomic.fna.gz" + lambda_reference = "${baseDir}/assets/data/GCA_000840245.1_ViralProj14204_genomic.fna.gz" + save_lambdaremoved_reads = false + save_porechop_reads = false + save_filtered_longreads = false + + // binning options + skip_metabat2 = false + skip_maxbin2 = false + skip_concoct = false + bin_domain_classification = false + bin_domain_classification_tool = 'tiara' + tiara_min_length = 3000 + refine_bins_dastool = false + refine_bins_dastool_threshold = 0.5 + postbinning_input = 'raw_bins_only' + exclude_unbins_from_postbinning = false + + // Bin QC + skip_binqc = false + binqc_tool = 'busco' + busco_db = null + 
busco_auto_lineage_prok = false + save_busco_db = false + busco_clean = false + checkm_download_url = "https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz" + checkm_db = null + save_checkm_data = false + checkm2_db = null + checkm2_db_version = 5571251 // corresponds to Zenodo record ID + save_checkm2_data = false + run_gunc = false + gunc_database_type = 'progenomes' + gunc_db = null + gunc_save_db = false + + // Reproducibility options + megahit_fix_cpu_1 = false + spades_fix_cpus = -1 + spadeshybrid_fix_cpus = -1 + metabat_rng_seed = 1 + + // Annotation options + skip_metaeuk = false + metaeuk_mmseqs_db = null + metaeuk_db = null + save_mmseqs_db = false // References - genome = null + //genome = null // we use --host_genome instead igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false @@ -160,16 +304,29 @@ profiles { ] } } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_host_rm { includeConfig 'conf/test_host_rm.config' } + test_hybrid { includeConfig 'conf/test_hybrid.config' } + test_hybrid_host_rm { includeConfig 'conf/test_hybrid_host_rm.config' } + test_busco_auto { includeConfig 'conf/test_busco_auto.config' } + test_ancient_dna { includeConfig 'conf/test_ancient_dna.config' } + test_adapterremoval { includeConfig 'conf/test_adapterremoval.config' } + test_binning_entry { includeConfig 'conf/test_binning_entry.config' } + test_binrefinement { includeConfig 'conf/test_binrefinement.config' } + test_no_clipping { includeConfig 'conf/test_no_clipping.config' } + test_bbnorm { includeConfig 'conf/test_bbnorm.config' } + test_nothing { includeConfig 'conf/test_nothing.config' } + test_virus_identification { includeConfig 'conf/test_virus_identification.config' } + test_single_end { includeConfig 'conf/test_single_end.config' } + test_concoct { includeConfig 'conf/test_concoct.config' } } // Load nf-core custom profiles from different Institutions includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" // Load nf-core/mag custom profiles from different institutions. -// TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs -// includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/mag.config" : "/dev/null" +includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/mag.config" : "/dev/null" // Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled @@ -228,46 +385,69 @@ manifest { name = 'nf-core/mag' author = """Hadrien Gourlé, Daniel Straub, Sabrina Krakau, James A. Fellows Yates, Maxime Borry""" // The author field is deprecated from Nextflow version 24.10.0, use contributors instead contributors = [ - // TODO nf-core: Update the field with the details of the contributors to your pipeline. 
New with Nextflow version 24.10.0 [ name: 'Hadrien Gourlé', - affiliation: '', + affiliation: 'Department of Animal Breeding and Genetics, Swedish University of Agricultural Sciences, Uppsala, Sweden', email: '', - github: '', - contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor') - orcid: '' + github: 'HadrienG', + contribution: ['author'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0001-9807-1082' ], [ - name: ' Daniel Straub', - affiliation: '', + name: 'Daniel Straub', + affiliation: 'Quantitative Biology Center (QBiC), University of Tübingen, Tübingen, Germany', email: '', - github: '', - contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor') - orcid: '' + github: 'd4straub', + contribution: ['author', 'maintainer'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0002-2553-0660' ], [ - name: ' Sabrina Krakau', - affiliation: '', + name: 'Sabrina Krakau', + affiliation: 'Quantitative Biology Center (QBiC), University of Tübingen, Tübingen, Germany', email: '', - github: '', - contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor') - orcid: '' + github: 'skrakau', + contribution: ['contributor'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0003-0603-7907' ], [ - name: ' James A. Fellows Yates', - affiliation: '', + name: 'Antonia Schuster', + affiliation: 'Quantitative Biology Center (QBiC), University of Tübingen, Tübingen, Germany', email: '', - github: '', - contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor') + github: 'AntoniaSchuster', + contribution: ['author'], // List of contribution types ('author', 'maintainer' or 'contributor') orcid: '' ], [ - name: ' Maxime Borry', - affiliation: '', + name: 'James A.
Fellows Yates', + affiliation: 'Department of Archaeogenetics, Max Planck Institute for Evolutionary Anthropology, Leipzig, Germany', + email: 'jfy133@gmail.com', + github: 'jfy133', + contribution: ['maintainer'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0001-5585-6277' + ], + [ + name: 'Maxime Borry', + affiliation: 'Department of Archaeogenetics, Max Planck Institute for Evolutionary Anthropology, Leipzig, Germany', email: '', - github: '', - contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor') - orcid: '' + github: 'maxibor', + contribution: ['contributor'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0001-9140-7559' + ], + [ + name: 'Jim Downie', + affiliation: 'Wellcome Sanger Institute, Hinxton, UK', + email: '', + github: 'prototaxites', + contribution: ['contributor'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0002-7175-0533' + ], + [ + name: 'Carson Miller', + affiliation: 'University of Washington, Seattle, USA', + email: '', + github: 'CarsonJM', + contribution: ['contributor'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: '0000-0001-9861-4884' ], ] homePage = 'https://github.com/nf-core/mag' @@ -276,7 +456,7 @@ manifest { defaultBranch = 'master' nextflowVersion = '!>=24.04.2' version = '3.3.1dev' - doi = '' + doi = '10.1093/nargab/lqac007' } // Nextflow plugins diff --git a/nextflow_schema.json b/nextflow_schema.json index 5f022229..18307b05 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -14,13 +14,30 @@ "properties": { "input": { "type": "string", - "format": "file-path", + "mimetype": "text/csv", + "format": "file-path-pattern", "exists": true, "schema": "assets/schema_input.json", + "pattern": "^\\S+\\.csv$", + "description": "CSV samplesheet file containing information about the samples in the experiment.", + "help_text": "Use this to specify the location of your input FastQ files and their associated metadata. You can also use the CSV file to assign different groups or to include long reads for hybrid assembly with metaSPAdes. The CSV file must have at least two columns (sample, short_reads_1) and at most the following headers: sample,run,group,short_reads_1,short_reads_2,long_reads. See [usage docs](https://nf-co.re/mag/usage#input-specifications).", + "fa_icon": "fas fa-file-csv" + }, + "single_end": { + "type": "boolean", + "description": "Specifies that the input is single-end reads.", + "fa_icon": "fas fa-align-center", + "help_text": "By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--single_end` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--input`. For example:\n\n```bash\n--single_end --input '*.fastq'\n```\n\nIt is not possible to run a mixture of single-end and paired-end files in one run." + }, + "assembly_input": { + "type": "string", "mimetype": "text/csv", + "format": "file-path-pattern", + "exists": true, + "schema": "assets/schema_assembly_input.json", "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location.
It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/mag/usage#samplesheet-input).", + "description": "Additional input CSV samplesheet containing information about pre-computed assemblies. When set, both read pre-processing and assembly are skipped and the pipeline begins at the binning stage.", + "help_text": "If you have pre-computed assemblies from another source, it is possible to jump straight to the binning stage of the pipeline by supplying these assemblies in a CSV file. This CSV file should have at minimum three columns and the following header: `id,group,assembler,fasta` (group is only required when `--coassemble_group` is set). Short reads must still be supplied to `--input` in CSV format. See [usage docs](https://nf-co.re/mag/usage#input-specifications) for further details.", "fa_icon": "fas fa-file-csv" }, "outdir": { @@ -49,22 +66,6 @@ "fa_icon": "fas fa-dna", "description": "Reference genome related files and options required for the workflow.", "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." - }, - "fasta": { - "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, "igenomes_ignore": { "type": "boolean", "description": "Do not load the iGenomes reference config.", @@ -152,6 +153,11 @@ "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "hidden": true + }, "email_on_fail": { "type": "string", "description": "Email address for completion summary, only when pipeline fails.", @@ -174,12 +180,6 @@ "fa_icon": "fas fa-file-upload", "hidden": true }, - "monochrome_logs": { - "type": "boolean", - "description": "Do not use coloured log outputs.", - "fa_icon": "fas fa-palette", - "hidden": true - }, "hook_url": { "type": "string", "description": "Incoming hook URL for messaging service", @@ -226,6 +226,673 @@ "hidden": true } } + }, + "reproducibility_options": { + "title": "Reproducibility options", + "type": "object", + "description": "Use these parameters to also enable reproducible results from the individual assembly and binning tools.", + "default": "", + "properties": { + "megahit_fix_cpu_1": { + "type": "boolean", + "description": "Fix number of CPUs for MEGAHIT to 1. Not increased with retries.", + "help_text": "MEGAHIT only generates reproducible results when run single-threaded. \n\nWhen using this parameter do not change the number of CPUs for the `megahit` process with a custom config file. This would result in an error.\n\nDefault: The number of CPUs is specified in the `base.config` file, and increased with each retry."
+ }, + "spades_fix_cpus": { + "type": "integer", + "default": -1, + "description": "Fix number of CPUs used by SPAdes. Not increased with retries.", + "help_text": "SPAdes is designed to be deterministic for a given number of threads. To generate reproducible results fix the number of CPUs using this parameter.\n\nWhen using this parameter do not change the number of CPUs for the `spades` process with a custom config file. This would result in an error.\n\nDefault: -1 (the number of CPUs is specified in the `base.config` or in a custom config file, and increased with each retry)." + }, + "spadeshybrid_fix_cpus": { + "type": "integer", + "default": -1, + "description": "Fix number of CPUs used by SPAdes hybrid. Not increased with retries.", + "help_text": "SPAdes is designed to be deterministic for a given number of threads. To generate reproducible results fix the number of CPUs using this parameter.\n\nWhen using this parameter do not change the number of CPUs for the `spadeshybrid` process with a custom config file. This would result in an error.\n\nDefault: -1 (the number of CPUs is specified in the `base.config` or in a custom config file, and increased with each retry)." + }, + "metabat_rng_seed": { + "type": "integer", + "default": 1, + "description": "RNG seed for MetaBAT2.", + "help_text": "MetaBAT2 is run by default with a fixed seed within this pipeline, thus producing reproducible results. You can also set it to any other positive integer to ensure reproducibility. Set the parameter to 0 to use a random seed." + } + }, + "help_text": "" + }, + "quality_control_for_short_reads_options": { + "title": "Quality control for short reads options", + "type": "object", + "description": "", + "default": "", + "properties": { + "clip_tool": { + "type": "string", + "default": "fastp", + "description": "Specify which adapter clipping tool to use.", + "enum": ["fastp", "adapterremoval"] + }, + "save_clipped_reads": { + "type": "boolean", + "description": "Specify to save the resulting clipped FASTQ files to --outdir." + }, + "reads_minlength": { + "type": "integer", + "default": 15, + "description": "The minimum length a read must have to be retained for downstream analysis." + }, + "fastp_qualified_quality": { + "type": "integer", + "default": 15, + "description": "Minimum phred quality value of a base to be qualified in fastp.", + "help": "Reads with more than 40% of unqualified bases will be discarded." + }, + "fastp_cut_mean_quality": { + "type": "integer", + "default": 15, + "description": "The mean quality requirement used for per read sliding window cutting by fastp.", + "help": "Used in combination with the fastp options '--cut_front' and '--cut_tail'. If the mean quality within a window (of size 4) is below `--fastp_cut_mean_quality`, the bases are dropped and the sliding window is moved further, otherwise it stops." + }, + "fastp_save_trimmed_fail": { + "type": "boolean", + "description": "Save reads that fail fastp filtering in a separate file. Not used downstream." + }, + "adapterremoval_minquality": { + "type": "integer", + "default": 2, + "description": "The minimum base quality for low-quality base trimming by AdapterRemoval." + }, + "adapterremoval_trim_quality_stretch": { + "type": "boolean", + "description": "Turn on quality trimming by a consecutive stretch of low quality bases, rather than by window.", + "help_text": "Default base-quality trimming is set to trim by 'windows', as in FastP.
Specifying this flag will use trim via contiguous stretch of low quality bases (Ns) instead.\n\n> Replaces --trimwindows 4 with --trimqualities in AdapterRemoval" + }, + "adapterremoval_adapter1": { + "type": "string", + "default": "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG", + "description": "Forward read adapter to be trimmed by AdapterRemoval." + }, + "adapterremoval_adapter2": { + "type": "string", + "default": "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT", + "description": "Reverse read adapter to be trimmed by AdapterRemoval for paired end data." + }, + "host_genome": { + "type": "string", + "help_text": "This parameter is mutually exclusive with `--host_fasta`. Host read removal is done with Bowtie2. \nBoth the iGenomes FASTA file as well as corresponding, already pre-built Bowtie 2 index files will be used.", + "description": "Name of iGenomes reference for host contamination removal." + }, + "host_fasta": { + "type": "string", + "description": "Fasta reference file for host contamination removal.", + "help_text": "This parameter is mutually exclusive with `--host_genome`. The reference can be masked. Host read removal is done with Bowtie2." + }, + "host_fasta_bowtie2index": { + "type": "string", + "description": "Bowtie2 index directory corresponding to `--host_fasta` reference file for host contamination removal.", + "help_text": "This parameter must be used in combination with `--host_fasta`, and should be a directory containing files from the output of `bowtie2-build`, i.e. files ending in `.bt2`" + }, + "host_removal_verysensitive": { + "type": "boolean", + "description": "Use the `--very-sensitive` instead of the`--sensitive`setting for Bowtie 2 to map reads against the host genome." + }, + "host_removal_save_ids": { + "type": "boolean", + "description": "Save the read IDs of removed host reads." + }, + "save_hostremoved_reads": { + "type": "boolean", + "description": "Specify to save input FASTQ files with host reads removed to --outdir." + }, + "keep_phix": { + "type": "boolean", + "description": "Keep reads similar to the Illumina internal standard PhiX genome." + }, + "phix_reference": { + "type": "string", + "default": "${baseDir}/assets/data/GCA_002596845.1_ASM259684v1_genomic.fna.gz", + "description": "Genome reference used to remove Illumina PhiX contaminant reads.", + "hidden": true + }, + "skip_clipping": { + "type": "boolean", + "description": "Skip read preprocessing using fastp or adapterremoval." + }, + "save_phixremoved_reads": { + "type": "boolean", + "description": "Specify to save input FASTQ files with phiX reads removed to --outdir." + }, + "bbnorm": { + "type": "boolean", + "description": "Run BBnorm to normalize sequence depth." + }, + "bbnorm_target": { + "type": "integer", + "default": 100, + "description": "Set BBnorm target maximum depth to this number." + }, + "bbnorm_min": { + "type": "integer", + "default": 5, + "description": "Set BBnorm minimum depth to this number." + }, + "save_bbnorm_reads": { + "type": "boolean", + "description": "Save normalized read files to output directory." + } + } + }, + "quality_control_for_long_reads_options": { + "title": "Quality control for long reads options", + "type": "object", + "description": "", + "default": "", + "properties": { + "skip_adapter_trimming": { + "type": "boolean", + "description": "Skip removing adapter sequences from long reads." 
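As a minimal illustration (not taken from the pipeline docs) of how the host-removal options above could be combined in a run-specific Nextflow config, with hypothetical paths:

    params {
        host_fasta              = '/path/to/host_genome.fna.gz'   // hypothetical (optionally masked) host reference
        host_fasta_bowtie2index = '/path/to/host_bowtie2_index/'  // directory containing the matching *.bt2 files
        host_removal_save_ids   = true                            // keep the IDs of removed host reads
        save_hostremoved_reads  = true                            // publish host-removed FASTQs to --outdir
    }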
+ }, + "longreads_min_length": { + "type": "integer", + "default": 1000, + "description": "Discard any read which is shorter than this value." + }, + "longreads_min_quality": { + "type": "integer", + "description": "Discard any read which has a mean quality score lower than this value." + }, + "longreads_keep_percent": { + "type": "integer", + "default": 90, + "description": "Keep this percent of bases." + }, + "longreads_length_weight": { + "type": "integer", + "default": 10, + "description": "The higher the more important is read length when choosing the best reads.", + "help_text": "The default value focuses on length instead of quality to improve assembly size.\nIn order to assign equal weights to read lengths and read qualities set this parameter to 1.\nThis might be useful, for example, to benefit indirectly from the removal of short host reads (causing lower qualities for reads not overlapping filtered short reads)." + }, + "keep_lambda": { + "type": "boolean", + "description": "Keep reads similar to the ONT internal standard Escherichia virus Lambda genome." + }, + "lambda_reference": { + "type": "string", + "default": "${baseDir}/assets/data/GCA_000840245.1_ViralProj14204_genomic.fna.gz", + "hidden": true, + "description": "Genome reference used to remove ONT Lambda contaminant reads." + }, + "save_lambdaremoved_reads": { + "type": "boolean", + "description": "Specify to save input FASTQ files with lamba reads removed to --outdir." + }, + "save_porechop_reads": { + "type": "boolean", + "description": "Specify to save the resulting clipped FASTQ files to --outdir." + }, + "save_filtered_longreads": { + "type": "boolean", + "description": "Specify to save the resulting length filtered long read FASTQ files to --outdir." + }, + "longread_adaptertrimming_tool": { + "type": "string", + "description": "Specify which long read adapter trimming tool to use.", + "enum": ["porechop", "porechop_abi"], + "default": "porechop_abi" + }, + "longread_filtering_tool": { + "type": "string", + "description": "Specify which long read filtering tool to use.", + "enum": ["filtlong", "nanoq", "chopper"], + "default": "filtlong" + } + } + }, + "taxonomic_profiling_options": { + "title": "Taxonomic profiling options", + "type": "object", + "description": "Taxonomic classification is disabled by default. You have to specify one of the options below to activate it.", + "default": "", + "properties": { + "centrifuge_db": { + "type": "string", + "format": "file-path", + "exists": true, + "description": "Database for taxonomic binning with centrifuge.", + "help_text": "Local directory containing `*.cf` files, or a URL or local path to a downloaded compressed tar archive of a Centrifuge database. E.g. ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz." + }, + "kraken2_db": { + "type": "string", + "format": "file-path", + "description": "Database for taxonomic binning with kraken2.", + "help_text": "Path to a local directory, archive file, or a URL to compressed tar archive that contains at least the three files `hash.k2d`, `opts.k2d` and `taxo.k2d`. E.g. ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz." + }, + "krona_db": { + "type": "string", + "description": "Database for taxonomic binning with krona", + "help_text": "Path to `taxonomy.tab` file for Krona, instead of downloading the default file. Point at the `.tab` file." + }, + "skip_krona": { + "type": "boolean", + "description": "Skip creating a krona plot for taxonomic binning." 
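A sketch of a long-read QC configuration using the parameters above; the chosen values are illustrative, not recommendations:

    params {
        longread_adaptertrimming_tool = 'porechop_abi'   // default; 'porechop' is the alternative
        longread_filtering_tool       = 'nanoq'          // instead of the default 'filtlong'; 'chopper' is also available
        longreads_min_length          = 1000
        longreads_keep_percent        = 90
    }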
+ }, + "cat_db": { + "type": "string", + "description": "Database for taxonomic classification of metagenome assembled genomes. Can be either a zipped file or a directory containing the extracted output of such.", + "help_text": "E.g. https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20210107.tar.gz. This parameter is mutually exclusive with `--cat_db_generate`. The file needs to contain a folder named `*taxonomy*` and `*database*` that hold the respective files." + }, + "cat_db_generate": { + "type": "boolean", + "description": "Generate CAT database.", + "help_text": "Download the taxonomy files from NCBI taxonomy, the nr database and generate CAT database. This parameter is mutually exclusive with `--cat_db`. Useful to build a CAT database with the same DIAMOND version as used for running CAT classification, avoiding compatibility problems." + }, + "save_cat_db": { + "type": "boolean", + "description": "Save the CAT database generated when specified by `--cat_db_generate`.", + "help_text": "Useful to allow reproducibility, as old versions of prebuild CAT databases do not always remain accessible and underlying NCBI taxonomy and nr databases change." + }, + "cat_official_taxonomy": { + "type": "boolean", + "description": "Only return official taxonomic ranks (Kingdom, Phylum, etc.) when running CAT." + }, + "skip_gtdbtk": { + "type": "boolean", + "description": "Skip the running of GTDB, as well as the automatic download of the database" + }, + "gtdb_db": { + "type": "string", + "description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. If not specified will be downloaded for you when GTDBTK or binning QC is not skipped.", + "default": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" + }, + "gtdb_mash": { + "type": "string", + "description": "Specify the location of a GTDBTK mash database. If missing, GTDB-Tk will skip the ani_screening step" + }, + "gtdbtk_min_completeness": { + "type": "number", + "default": 50, + "description": "Min. bin completeness (in %) required to apply GTDB-tk classification.", + "help_text": "Completeness assessed with BUSCO analysis (100% - %Missing). Must be greater than 0 (min. 0.01) to avoid GTDB-tk errors. If too low, GTDB-tk classification results can be impaired due to not enough marker genes!", + "minimum": 0.01, + "maximum": 100 + }, + "gtdbtk_max_contamination": { + "type": "number", + "default": 10, + "description": "Max. bin contamination (in %) allowed to apply GTDB-tk classification.", + "help_text": "Contamination approximated based on BUSCO analysis (%Complete and duplicated). If too high, GTDB-tk classification results can be impaired due to contamination!", + "minimum": 0, + "maximum": 100 + }, + "gtdbtk_min_perc_aa": { + "type": "number", + "default": 10, + "description": "Min. fraction of AA (in %) in the MSA for bins to be kept.", + "minimum": 0, + "maximum": 100 + }, + "gtdbtk_min_af": { + "type": "number", + "default": 0.65, + "description": "Min. alignment fraction to consider closest genome.", + "minimum": 0, + "maximum": 1 + }, + "gtdbtk_pplacer_cpus": { + "type": "integer", + "default": 1, + "description": "Number of CPUs used for the by GTDB-Tk run tool pplacer.", + "help_text": "A low number of CPUs helps to reduce the memory required/reported by GTDB-Tk. 
See also the [GTDB-Tk documentation](https://ecogenomics.github.io/GTDBTk/faq.html#gtdb-tk-reaches-the-memory-limit-pplacer-crashes)." + }, + "gtdbtk_pplacer_useram": { + "type": "boolean", + "description": "Speed up pplacer step of GTDB-Tk by loading to memory.", + "help_text": "Will be faster than writing to disk (default setting), however at the expense of much larger memory (RAM) requirements for GDTBTK/CLASSIFY." + }, + "genomad_db": { + "type": "string", + "description": "Database for virus classification with geNomad", + "help_text": "Must be a directory containing the uncompressed contents from https://zenodo.org/doi/10.5281/zenodo.6994741 (nf-core/mag tested with v1.1)" + } + } + }, + "assembly_options": { + "title": "Assembly options", + "type": "object", + "description": "", + "default": "", + "properties": { + "coassemble_group": { + "type": "boolean", + "description": "Co-assemble samples within one group, instead of assembling each sample separately." + }, + "spades_options": { + "type": "string", + "description": "Additional custom options for SPAdes and SPAdesHybrid. Do not specify `--meta` as this will be added for you!", + "help_text": "An example is adjusting k-mers (\"-k 21,33,55,77\") or adding [advanced options](https://github.com/ablab/spades#advanced-options). But not --meta, -t, -m, -o or --out-prefix, because these are already in use. Must be used like this: --spades_options \"-k 21,33,55,77\")" + }, + "megahit_options": { + "type": "string", + "description": "Additional custom options for MEGAHIT.", + "help_text": "An example is adjusting presets (e.g. \"--presets meta-large\"), k-mers (e.g. \"-k 21,33,55,77\") or adding other [advanced options](https://github.com/voutcn/megahit#advanced-usage). For example, increase the minimum k-mer in the event of an error message such as \"Too many vertices in the unitig graph, you may increase the kmer size to remove tons of erroneous kmers.\" in the MEGAHIT log file. But not --threads, --memory, -o or input read files, because these are already in use. Must be used like this: --megahit_options \"--presets meta-large\"" + }, + "skip_spades": { + "type": "boolean", + "description": "Skip Illumina-only SPAdes assembly." + }, + "skip_spadeshybrid": { + "type": "boolean", + "description": "Skip SPAdes hybrid assembly." + }, + "skip_megahit": { + "type": "boolean", + "description": "Skip MEGAHIT assembly." + }, + "skip_quast": { + "type": "boolean", + "description": "Skip metaQUAST." + } + } + }, + "gene_prediction_and_annotation_options": { + "title": "Gene prediction and annotation options", + "type": "object", + "description": "", + "default": "", + "properties": { + "skip_prodigal": { + "type": "boolean", + "description": "Skip Prodigal gene prediction" + }, + "prokka_with_compliance": { + "type": "boolean", + "help_text": "Sometimes Prokka will complain that your contig names are too long and fail.\n\nThis particularly happens with metaSPAdes assemblies.\n\nYou can turn on this flag which will tell Prokka to truncate the contig names for you.\nHowever this also requires you to specify a sequencing centre name (specified with `--prokka_compliance_centre`).\n\n:::warning\nTruncating contig names may make it harder to associated contig annotations with their original contigs!\n:::\n", + "description": "Turn on Prokka complicance mode for truncating contig names for NCBI/ENA compatibility." 
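Putting the quoting rules for the assembler options together with the Prokka compliance pair, a hypothetical configuration might look like this (the centre name is a placeholder):

    params {
        spades_options           = '-k 21,33,55,77'        // extra SPAdes flags; --meta is added by the pipeline
        megahit_options          = '--presets meta-large'  // extra MEGAHIT flags
        prokka_with_compliance   = true
        prokka_compliance_centre = 'MYCENTRE'              // placeholder sequencing centre name
    }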
+ }, + "prokka_compliance_centre": { + "type": "string", + "help_text": "Specify the sequencing centre name for making NCBI Genbank/ENA compatible annotation files (required when specifying `--prokka_with_compliance`).", + "description": "Specify sequencing centre name required for Prokka's compliance mode." + }, + "skip_prokka": { + "type": "boolean", + "description": "Skip Prokka genome annotation." + }, + "skip_metaeuk": { + "type": "boolean", + "description": "Skip MetaEuk gene prediction and annotation" + }, + "metaeuk_mmseqs_db": { + "type": "string", + "description": "A string containing the name of one of the databases listed in the [mmseqs2 documentation](https://github.com/soedinglab/MMseqs2/wiki#downloading-databases). This database will be downloaded and formatted for eukaryotic genome annotation. Incompatible with --metaeuk_db.", + "help_text": "mmseqs2 lists a large number of databases, not all of which are appropriate for use with MetaEuk. MetaEuk requires protein inputs, so you should select one of the Aminoacid or Profile options." + }, + "metaeuk_db": { + "type": "string", + "description": "Path to either a local fasta file of protein sequences, or to a directory containing an mmseqs2-formatted database, for annotation of eukaryotic genomes.", + "help_text": "One option would be the databases from the MetaEuk publication (https://wwwuser.gwdg.de/~compbiol/metaeuk/), however it should be noted that these are focused on marine eukaryotes." + }, + "save_mmseqs_db": { + "type": "boolean", + "description": "Save the downloaded mmseqs2 database specified in `--metaeuk_mmseqs_db`." + } + } + }, + "virus_identification_options": { + "title": "Virus identification options", + "type": "object", + "default": "", + "properties": { + "run_virus_identification": { + "type": "boolean", + "description": "Run virus identification." + }, + "genomad_min_score": { + "type": "number", + "default": 0.7, + "description": "Minimum geNomad score for a sequence to be considered viral" + }, + "genomad_splits": { + "type": "integer", + "default": 1, + "description": "Number of groups that geNomad's MMSeqs2 databse should be split into (reduced memory requirements)" + } + } + }, + "binning_options": { + "title": "Binning options", + "type": "object", + "description": "", + "default": "", + "properties": { + "binning_map_mode": { + "type": "string", + "default": "group", + "description": "Defines mapping strategy to compute co-abundances for binning, i.e. which samples will be mapped against the assembly.", + "help_text": "Available: `all`, `group` or `own`. Note that `own` cannot be specified in combination with `--coassemble_group`.\n\nNote that specifying `all` without additionally specifying `--coassemble_group` results in `n^2` mapping processes for each assembly method, where `n` is the number of samples.", + "enum": ["all", "group", "own"] + }, + "skip_binning": { + "type": "boolean", + "description": "Skip metagenome binning entirely" + }, + "skip_metabat2": { + "type": "boolean", + "description": "Skip MetaBAT2 Binning" + }, + "skip_maxbin2": { + "type": "boolean", + "description": "Skip MaxBin2 Binning" + }, + "skip_concoct": { + "type": "boolean", + "description": "Skip CONCOCT Binning" + }, + "min_contig_size": { + "type": "integer", + "default": 1500, + "description": "Minimum contig size to be considered for binning and for bin quality check.", + "help_text": "For forwarding into downstream analysis, i.e. QUAST and BUSCO, and reporting." 
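For example, virus identification with geNomad could be switched on with something like the following sketch (the database path and split count are hypothetical):

    params {
        run_virus_identification = true
        genomad_db               = '/path/to/genomad_db/'  // uncompressed geNomad database directory
        genomad_min_score        = 0.7                     // default score threshold
        genomad_splits           = 4                       // more splits lowers memory use at the cost of runtime
    }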
+ }, + "min_length_unbinned_contigs": { + "type": "integer", + "default": 1000000, + "description": "Minimal length of contigs that are not part of any bin but treated as individual genome.", + "help_text": "Contigs that do not fulfill the thresholds of `--min_length_unbinned_contigs` and `--max_unbinned_contigs` are pooled for downstream analysis and reporting, except contigs that also do not fullfill `--min_contig_size` are not considered further." + }, + "max_unbinned_contigs": { + "type": "integer", + "default": 100, + "description": "Maximal number of contigs that are not part of any bin but treated as individual genome.", + "help_text": "Contigs that do not fulfill the thresholds of `--min_length_unbinned_contigs` and `--max_unbinned_contigs` are pooled for downstream analysis and reporting, except contigs that also do not fullfill `--min_contig_size` are not considered further." + }, + "bowtie2_mode": { + "type": "string", + "description": "Bowtie2 alignment mode", + "help_text": "Bowtie2 alignment mode options, for example: `--very-fast` , `--very-sensitive-local -N 1` , ... Must be used like this: --bowtie2_mode \"--very-sensitive\"" + }, + "save_assembly_mapped_reads": { + "type": "boolean", + "description": "Save the output of mapping raw reads back to assembled contigs", + "help_text": "Specify to save the BAM and BAI files generated when mapping input reads back to the assembled contigs (performed in preparation for binning and contig depth estimations)." + }, + "bin_domain_classification": { + "type": "boolean", + "description": "Enable domain-level (prokaryote or eukaryote) classification of bins using Tiara. Processes which are domain-specific will then only receive bins matching the domain requirement.", + "help_text": "Enable this if it is likely that your metagenome samples contain a mixture of eukaryotic and prokaryotic genomes. This will ensure that prokaryote-only steps only receive putatively prokaryotic genomes, and vice-versa. Additionally, may improve the performance of DAS Tool by ensuring it only receives prokaryotic genomes." + }, + "bin_domain_classification_tool": { + "type": "string", + "default": "tiara", + "description": "Specify which tool to use for domain classification of bins. Currently only 'tiara' is implemented.", + "hidden": true + }, + "tiara_min_length": { + "type": "integer", + "default": 3000, + "description": "Minimum contig length for Tiara to use for domain classification. For accurate classification, should be longer than 3000 bp." + }, + "exclude_unbins_from_postbinning": { + "type": "boolean", + "description": "Exclude unbinned contigs in the post-binning steps (bin QC, taxonomic classification, and annotation steps).", + "help": "If you're not interested in assemby results that are not considered 'genome level', excluding unbinned contigs can greatly speed up downstream steps such as Prokka, that can be quite slow and spin up many tasks." + } + } + }, + "bin_quality_check_options": { + "title": "Bin quality check options", + "type": "object", + "description": "", + "default": "", + "properties": { + "skip_binqc": { + "type": "boolean", + "description": "Disable bin QC with BUSCO, CheckM or CheckM2." 
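A small sketch showing how the domain-classification and post-binning options above might be combined for a dataset expected to contain both prokaryotic and eukaryotic genomes:

    params {
        bin_domain_classification       = true   // classify bins with Tiara so domain-specific steps get matching bins
        tiara_min_length                = 3000   // default minimum contig length used by Tiara
        exclude_unbins_from_postbinning = true   // drop unbinned contigs before bin QC, classification and annotation
    }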
+ }, + "binqc_tool": { + "type": "string", + "default": "busco", + "description": "Specify which tool for bin quality-control validation to use.", + "enum": ["busco", "checkm", "checkm2"] + }, + "busco_db": { + "type": "string", + "description": "Download URL for BUSCO lineage dataset, or path to a tar.gz archive, or local directory containing already downloaded and unpacked lineage datasets.", + "help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2024-01-08.tar.gz or '/path/to/buscodb' (files still need to be unpacked manually). Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/." + }, + "busco_auto_lineage_prok": { + "type": "boolean", + "description": "Run BUSCO with automated lineage selection, but ignoring eukaryotes (saves runtime)." + }, + "save_busco_db": { + "type": "boolean", + "description": "Save the used BUSCO lineage datasets provided via `--busco_db`.", + "help_text": "Useful to allow reproducibility, as BUSCO datasets are frequently updated and old versions do not always remain accessible." + }, + "busco_clean": { + "type": "boolean", + "description": "Enable clean-up of temporary files created during BUSCO runs.", + "help_text": "By default, BUSCO creates a large number of intermediate files every run. This may cause problems on some clusters which have file number limits in plate, particularly with large numbers of bins. Enabling this option cleans these files, reducing the total file count of the work directory." + }, + "checkm_download_url": { + "type": "string", + "default": "https://zenodo.org/records/7401545/files/checkm_data_2015_01_16.tar.gz", + "hidden": true, + "description": "URL pointing to checkM database for auto download, if local path not supplied.", + "help_text": "You can use this parameter to point to an online copy of the checkM database TAR archive that the pipeline will use for auto download if a local path is not supplied to `--checkm_db`." + }, + "checkm_db": { + "type": "string", + "description": "Path to local folder containing already downloaded and uncompressed CheckM database.", + "help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm_db`." + }, + "save_checkm_data": { + "type": "boolean", + "description": "Save the used CheckM reference files downloaded when not using --checkm_db parameter.", + "help_text": "If specified, the directories and files decompressed from the `tar.gz` file downloaded from the [CheckM FTP server](https://data.ace.uq.edu.au/public/CheckM_databases/) will be stored in your output directory alongside your CheckM results." + }, + "checkm2_db": { + "type": "string", + "description": "Path to local folder containing already downloaded and uncompressed CheckM2 database (.dmnd file).", + "help_text": "The pipeline can also download this for you if not specified, and you can save the resulting directory into your output directory by specifying `--save_checkm2_data`. You should move this directory to somewhere else on your machine (and supply back to the pipeline in future runs again with `--checkm2_db`)." 
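As an illustration of pairing `--binqc_tool` with a matching pre-downloaded database (the local path and file name are hypothetical):

    params {
        binqc_tool = 'checkm2'
        checkm2_db = '/path/to/checkm2_db/checkm2_database.dmnd'   // hypothetical path to the CheckM2 .dmnd file
    }

The same pattern applies to `--busco_db` or `--checkm_db` when those tools are selected instead.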
+ }, + "checkm2_db_version": { + "type": "integer", + "default": 5571251, + "description": "CheckM2 database version number to download (Zenodo record ID, for reference check the canonical reference https://zenodo.org/records/5571251, and pick the Zenodo ID of the database version of your choice)." + }, + "save_checkm2_data": { + "type": "boolean", + "description": "Save the used CheckM2 reference files downloaded when not using --checkm2_db parameter.", + "help_text": "If specified, the directories and files decompressed from the `tar.gz` file downloaded from the [Zenodo repository](https://zenodo.org/records/5571251) will be stored in your output directory alongside your CheckM2 results." + }, + "refine_bins_dastool": { + "type": "boolean", + "description": "Turn on bin refinement using DAS Tool." + }, + "refine_bins_dastool_threshold": { + "type": "number", + "default": 0.5, + "description": "Specify single-copy gene score threshold for bin refinement.", + "help_text": "Score threshold for single-copy gene selection algorithm to keep selecting bins, with a value ranging from 0-1.\n\nFor description of scoring algorithm, see: Sieber, Christian M. K., et al. 2018. Nature Microbiology 3 (7): 836\u201343. https://doi.org/10.1038/s41564-018-0171-1.\n\n> Modifies DAS Tool parameter --score_threshold\n" + }, + "postbinning_input": { + "type": "string", + "default": "raw_bins_only", + "description": "Specify which binning output is sent for downstream annotation, taxonomic classification, bin quality control etc.", + "help_text": "`raw_bins_only`: only bins (and unbinned contigs) from the binners.\n`refined_bins_only`: only bins (and unbinned contigs) from the bin refinement step .\n\n ~~`both`: bins and unbinned contigs from both the binning and bin refinement steps.~~ `both` option is disabled in v2.4 due a bug that will be fixed in a later release.", + "enum": ["raw_bins_only", "refined_bins_only", "both"] + }, + "run_gunc": { + "type": "boolean", + "description": "Turn on GUNC genome chimerism checks" + }, + "gunc_db": { + "type": "string", + "description": "Specify a path to a pre-downloaded GUNC dmnd database file" + }, + "gunc_database_type": { + "type": "string", + "default": "progenomes", + "description": "Specify which database to auto-download if not supplying own", + "enum": ["progenomes", "gtdb"] + }, + "gunc_save_db": { + "type": "boolean", + "description": "Save the used GUNC reference files downloaded when not using --gunc_db parameter.", + "help_text": "If specified, the corresponding DIAMOND file downloaded from the GUNC server will be stored in your output directory alongside your GUNC results." 
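Finally, bin refinement and chimerism checks could be enabled together along these lines (values mirror the defaults documented above):

    params {
        refine_bins_dastool           = true
        refine_bins_dastool_threshold = 0.5                   // DAS Tool --score_threshold
        postbinning_input             = 'refined_bins_only'   // send refined bins to downstream QC and annotation
        run_gunc                      = true
        gunc_database_type            = 'progenomes'          // auto-downloaded when --gunc_db is not supplied
    }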
+ } + } + }, + "ancient_dna_assembly": { + "title": "Ancient DNA assembly", + "type": "object", + "description": "Performs ancient DNA assembly validation and contig consensus sequence recalling.", + "default": "", + "properties": { + "ancient_dna": { + "type": "boolean", + "description": "Turn on/off the ancient DNA subworfklow" + }, + "pydamage_accuracy": { + "type": "number", + "default": 0.5, + "description": "PyDamage accuracy threshold" + }, + "skip_ancient_damagecorrection": { + "type": "boolean", + "description": "deactivate damage correction of ancient contigs using variant and consensus calling" + }, + "freebayes_ploidy": { + "type": "integer", + "default": 1, + "description": "Ploidy for variant calling" + }, + "freebayes_min_basequality": { + "type": "integer", + "default": 20, + "description": "minimum base quality required for variant calling" + }, + "freebayes_minallelefreq": { + "type": "number", + "default": 0.33, + "description": "minimum minor allele frequency for considering variants" + }, + "bcftools_view_high_variant_quality": { + "type": "integer", + "default": 30, + "description": "minimum genotype quality for considering a variant high quality" + }, + "bcftools_view_medium_variant_quality": { + "type": "integer", + "default": 20, + "description": "minimum genotype quality for considering a variant medium quality" + }, + "bcftools_view_minimal_allelesupport": { + "type": "integer", + "default": 3, + "description": "minimum number of bases supporting the alternative allele" + } + } } }, "allOf": [ @@ -240,6 +907,36 @@ }, { "$ref": "#/$defs/generic_options" + }, + { + "$ref": "#/$defs/reproducibility_options" + }, + { + "$ref": "#/$defs/quality_control_for_short_reads_options" + }, + { + "$ref": "#/$defs/quality_control_for_long_reads_options" + }, + { + "$ref": "#/$defs/taxonomic_profiling_options" + }, + { + "$ref": "#/$defs/assembly_options" + }, + { + "$ref": "#/$defs/gene_prediction_and_annotation_options" + }, + { + "$ref": "#/$defs/virus_identification_options" + }, + { + "$ref": "#/$defs/binning_options" + }, + { + "$ref": "#/$defs/bin_quality_check_options" + }, + { + "$ref": "#/$defs/ancient_dna_assembly" } ] } diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 241911b9..de24da83 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -31,6 +31,9 @@ { "@id": "assets/" }, + { + "@id": "bin/" + }, { "@id": "conf/" }, @@ -43,6 +46,9 @@ { "@id": "modules/" }, + { + "@id": "modules/local/" + }, { "@id": "modules/nf-core/" }, @@ -123,8 +129,23 @@ "@id": "main.nf", "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow"], "creator": [ + { + "@id": "#jfy133@gmail.com" + }, + { + "@id": "#68351153+CarsonJM@users.noreply.github.com" + }, + { + "@id": "https://orcid.org/0000-0003-0603-7907" + }, + { + "@id": "#42973691+d4straub@users.noreply.github.com" + }, { "@id": "https://orcid.org/0000-0001-9807-1082" + }, + { + "@id": "#jfy133@gmail.com" } ], "dateCreated": "", @@ -144,6 +165,15 @@ ], "license": ["MIT"], "maintainer": [ + { + "@id": "#jfy133@gmail.com" + }, + { + "@id": "https://orcid.org/0000-0003-0603-7907" + }, + { + "@id": "#42973691+d4straub@users.noreply.github.com" + }, { "@id": "https://orcid.org/0000-0001-9807-1082" } @@ -206,6 +236,11 @@ "@type": "Dataset", "description": "Additional files" }, + { + "@id": "bin/", + "@type": "Dataset", + "description": "Scripts that must be callable from a pipeline process" + }, { "@id": "conf/", "@type": "Dataset", @@ -226,6 +261,11 @@ "@type": "Dataset", "description": "Modules used 
by the pipeline" }, + { + "@id": "modules/local/", + "@type": "Dataset", + "description": "Pipeline-specific modules" + }, { "@id": "modules/nf-core/", "@type": "Dataset", @@ -312,6 +352,30 @@ "name": "nf-core", "url": "https://nf-co.re/" }, + { + "@id": "#jfy133@gmail.com", + "@type": "Person", + "email": "jfy133@gmail.com", + "name": "James Fellows Yates" + }, + { + "@id": "#68351153+CarsonJM@users.noreply.github.com", + "@type": "Person", + "email": "68351153+CarsonJM@users.noreply.github.com", + "name": "Carson J Miller" + }, + { + "@id": "https://orcid.org/0000-0003-0603-7907", + "@type": "Person", + "email": "sabrina.krakau.qbic@gmail.com", + "name": "Sabrina Krakau" + }, + { + "@id": "#42973691+d4straub@users.noreply.github.com", + "@type": "Person", + "email": "42973691+d4straub@users.noreply.github.com", + "name": "Daniel Straub" + }, { "@id": "https://orcid.org/0000-0001-9807-1082", "@type": "Person", diff --git a/subworkflows/local/ancient_dna.nf b/subworkflows/local/ancient_dna.nf new file mode 100644 index 00000000..868b16ee --- /dev/null +++ b/subworkflows/local/ancient_dna.nf @@ -0,0 +1,68 @@ +include { BCFTOOLS_CONSENSUS } from '../../modules/nf-core/bcftools/consensus/main' +include { BCFTOOLS_INDEX as BCFTOOLS_INDEX_PRE ; BCFTOOLS_INDEX as BCFTOOLS_INDEX_POST } from '../../modules/nf-core/bcftools/index/main' +include { BCFTOOLS_VIEW } from '../../modules/nf-core/bcftools/view/main' +include { FREEBAYES } from '../../modules/nf-core/freebayes/main' +include { PYDAMAGE_ANALYZE } from '../../modules/nf-core/pydamage/analyze/main' +include { PYDAMAGE_FILTER } from '../../modules/nf-core/pydamage/filter/main' +include { SAMTOOLS_FAIDX as FAIDX} from '../../modules/nf-core/samtools/faidx/main' + +workflow ANCIENT_DNA_ASSEMBLY_VALIDATION { + take: + input //channel: [val(meta), path(contigs), path(bam), path(bam_index)] + main: + ch_versions = Channel.empty() + + PYDAMAGE_ANALYZE( + input.map { + meta, contigs, bam, bai -> [ + meta, bam[0], bai[0] + ] + } + ) + + PYDAMAGE_FILTER(PYDAMAGE_ANALYZE.out.csv) + ch_versions = ch_versions.mix(PYDAMAGE_ANALYZE.out.versions.first()) + + if ( params.skip_ancient_damagecorrection ) { + ch_corrected_contigs = Channel.empty() + } + + if ( !params.skip_ancient_damagecorrection ) { + FAIDX(input.map { item -> [ item[0], item[1] ] }, [[],[]] ) + freebayes_input = input.join(FAIDX.out.fai) // [val(meta), path(contigs), path(bam), path(bam_index), path(fai)] + .multiMap{ + meta, contigs, bam, bai, fai -> + reads: [ meta, bam, bai, [], [], [] ] + fasta: [ contigs ] + fai: [ fai ] + } + FREEBAYES ( freebayes_input.reads, + freebayes_input.fasta, + freebayes_input.fai, + [], + [], + [] ) + + BCFTOOLS_INDEX_PRE(FREEBAYES.out.vcf) + BCFTOOLS_VIEW(FREEBAYES.out.vcf.join(BCFTOOLS_INDEX_PRE.out.tbi), [], [], []) + BCFTOOLS_INDEX_POST(BCFTOOLS_VIEW.out.vcf) + BCFTOOLS_CONSENSUS(BCFTOOLS_VIEW.out.vcf + .join(BCFTOOLS_INDEX_POST.out.tbi) + .join(input.map { item -> [ item[0], item[1] ] })) + + ch_corrected_contigs = BCFTOOLS_CONSENSUS.out.fasta + + ch_versions = ch_versions.mix(FAIDX.out.versions.first()) + ch_versions = ch_versions.mix(FREEBAYES.out.versions.first()) + ch_versions = ch_versions.mix(BCFTOOLS_CONSENSUS.out.versions.first()) + } + + + + emit: + contigs_recalled = ch_corrected_contigs // channel: [ val(meta), path(fasta) ] + pydamage_results = PYDAMAGE_ANALYZE.out.csv // channel: [ val(meta), path(csv) ] + pydamage_filtered_results = PYDAMAGE_FILTER.out.csv // channel: [ val(meta), path(csv) ] + versions = ch_versions // channel: [ 
versions.yml ] +} + diff --git a/subworkflows/local/bin_qc.nf b/subworkflows/local/bin_qc.nf new file mode 100644 index 00000000..5a83d140 --- /dev/null +++ b/subworkflows/local/bin_qc.nf @@ -0,0 +1,234 @@ +/* + * BUSCO/CheckM/CheckM2/GUNC: Quantitative measures for the assessment of genome assembly + */ + +include { ARIA2 as ARIA2_UNTAR } from '../../modules/nf-core/aria2/main' +include { CHECKM2_DATABASEDOWNLOAD } from '../../modules/nf-core/checkm2/databasedownload/main' +include { BUSCO_DB_PREPARATION } from '../../modules/local/busco_db_preparation' +include { BUSCO } from '../../modules/local/busco' +include { BUSCO_SAVE_DOWNLOAD } from '../../modules/local/busco_save_download' +include { BUSCO_SUMMARY } from '../../modules/local/busco_summary' +include { CHECKM_QA } from '../../modules/nf-core/checkm/qa/main' +include { CHECKM_LINEAGEWF } from '../../modules/nf-core/checkm/lineagewf/main' +include { CHECKM2_PREDICT } from '../../modules/nf-core/checkm2/predict/main' +include { COMBINE_TSV as COMBINE_BINQC_TSV } from '../../modules/local/combine_tsv' +include { GUNC_DOWNLOADDB } from '../../modules/nf-core/gunc/downloaddb/main' +include { GUNC_RUN } from '../../modules/nf-core/gunc/run/main' +include { GUNC_MERGECHECKM } from '../../modules/nf-core/gunc/mergecheckm/main' + + +workflow BIN_QC { + take: + ch_bins // [ [ meta] , fasta ], input bins (mandatory) + + main: + qc_summary = [] + ch_input_bins_for_qc = ch_bins.transpose() + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + + /* + ================================ + * Setup databases + ================================ + */ + + if (params.busco_db) { + ch_busco_db = file(params.busco_db, checkIfExists: true) + } + else { + ch_busco_db = [] + } + + if (params.checkm_db) { + ch_checkm_db = file(params.checkm_db, checkIfExists: true) + } + else if (params.binqc_tool == 'checkm') { + ARIA2_UNTAR(params.checkm_download_url) + ch_checkm_db = ARIA2_UNTAR.out.downloaded_file + } + else { + ch_checkm_db = [] + } + + if (params.checkm2_db) { + ch_checkm2_db = [[:], file(params.checkm2_db, checkIfExists: true)] + } + else if (params.binqc_tool == 'checkm2') { + CHECKM2_DATABASEDOWNLOAD(params.checkm2_db_version) + ch_checkm2_db = CHECKM2_DATABASEDOWNLOAD.out.database + } + else { + ch_checkm2_db = [] + } + + if (params.gunc_db) { + ch_gunc_db = file(params.gunc_db, checkIfExists: true) + } + else { + ch_gunc_db = Channel.empty() + } + + + /* + ================================ + * Run QC tools + ================================ + */ + + if (params.binqc_tool == "busco") { + /* + * BUSCO + */ + if (!ch_busco_db.isEmpty()) { + if (ch_busco_db.extension in ['gz', 'tgz']) { + // Expects to be tar.gz! + BUSCO_DB_PREPARATION(ch_busco_db) + ch_db_for_busco = BUSCO_DB_PREPARATION.out.db.map { meta, db -> + [[id: meta, lineage: 'Y'], db] + } + } + else if (ch_busco_db.isDirectory()) { + // Set meta to match expected channel cardinality for BUSCO + ch_db_for_busco = Channel + .of(ch_busco_db) + .collect { db -> + def basename = db.getBaseName() + def lineage = basename.contains('odb10') ? 
'Y' : 'N' + [[id: basename, lineage: lineage], db] + } + } + } + else { + // Set BUSCO database to empty to allow for --auto-lineage + ch_db_for_busco = Channel + .of([[lineage: ''], []]) + .collect() + } + + if (params.save_busco_db) { + // publish files downloaded by Busco + ch_downloads = BUSCO.out.busco_downloads + .groupTuple() + .map { _lin, downloads -> downloads[0] } + .toSortedList() + .flatten() + BUSCO_SAVE_DOWNLOAD(ch_downloads) + + ch_versions = ch_versions.mix(BUSCO_SAVE_DOWNLOAD.out.versions.first()) + } + + BUSCO(ch_input_bins_for_qc, ch_db_for_busco) + + BUSCO_SUMMARY( + BUSCO.out.summary_domain.collect { _meta, summary -> summary }.ifEmpty([]), + BUSCO.out.summary_specific.collect { _meta, summary -> summary }.ifEmpty([]), + BUSCO.out.failed_bin.collect { _meta, summary -> summary }.ifEmpty([]) + ) + + ch_multiqc_files = ch_multiqc_files.mix( + BUSCO.out.summary_domain.mix(BUSCO.out.summary_specific).map { _meta, summary -> summary } + ) + qc_summary = BUSCO_SUMMARY.out.summary + ch_versions = ch_versions.mix(BUSCO.out.versions.first()) + } + else if (params.binqc_tool == "checkm") { + /* + * CheckM + */ + ch_bins_for_checkmlineagewf = ch_input_bins_for_qc + .groupTuple() + .filter { meta, _bins -> + meta.domain != "eukarya" + } + .multiMap { meta, fa -> + reads: [meta, fa] + ext: fa.extension.unique().join("") + } + + CHECKM_LINEAGEWF(ch_bins_for_checkmlineagewf.reads, ch_bins_for_checkmlineagewf.ext, ch_checkm_db) + ch_versions = ch_versions.mix(CHECKM_LINEAGEWF.out.versions.first()) + + ch_checkmqa_input = CHECKM_LINEAGEWF.out.checkm_output + .join(CHECKM_LINEAGEWF.out.marker_file) + .map { meta, dir, marker -> + [meta, dir, marker, []] + } + + CHECKM_QA(ch_checkmqa_input, []) + + COMBINE_BINQC_TSV(CHECKM_QA.out.output.collect { summary -> summary[1] }) + + qc_summary = COMBINE_BINQC_TSV.out.combined + ch_versions = ch_versions.mix( + CHECKM_QA.out.versions.first(), + COMBINE_BINQC_TSV.out.versions + ) + } + else if (params.binqc_tool == "checkm2") { + /* + * CheckM2 + */ + CHECKM2_PREDICT(ch_input_bins_for_qc.groupTuple(), ch_checkm2_db) + + COMBINE_BINQC_TSV(CHECKM2_PREDICT.out.checkm2_tsv.collect { summary -> summary[1] }) + + qc_summary = COMBINE_BINQC_TSV.out.combined + ch_versions = ch_versions.mix( + CHECKM2_PREDICT.out.versions.first(), + COMBINE_BINQC_TSV.out.versions + ) + } + + if (params.run_gunc) { + /* + * GUNC + */ + ch_input_bins_for_gunc = ch_bins + .filter { meta, _bins -> + meta.domain != "eukarya" + } + + if (params.gunc_db) { + ch_db_for_gunc = ch_gunc_db + } + else { + ch_db_for_gunc = GUNC_DOWNLOADDB(params.gunc_database_type).db + ch_versions.mix(GUNC_DOWNLOADDB.out.versions) + } + + GUNC_RUN(ch_input_bins_for_gunc, ch_db_for_gunc) + ch_versions.mix(GUNC_RUN.out.versions) + + // Make sure to keep directory in sync with modules.conf + GUNC_RUN.out.maxcss_level_tsv + .map { _meta, gunc_summary -> gunc_summary } + .collectFile( + name: "gunc_summary.tsv", + keepHeader: true, + storeDir: "${params.outdir}/GenomeBinning/QC/" + ) + + if (params.binqc_tool == 'checkm') { + ch_input_to_mergecheckm = GUNC_RUN.out.maxcss_level_tsv.combine(CHECKM_QA.out.output, by: 0) + + GUNC_MERGECHECKM(ch_input_to_mergecheckm) + ch_versions.mix(GUNC_MERGECHECKM.out.versions) + + // Make sure to keep directory in sync with modules.conf + GUNC_MERGECHECKM.out.tsv + .map { _meta, gunc_checkm_summary -> gunc_checkm_summary } + .collectFile( + name: "gunc_checkm_summary.tsv", + keepHeader: true, + storeDir: "${params.outdir}/GenomeBinning/QC/" + ) + } + } + + emit: + 
qc_summary = qc_summary + multiqc_files = ch_multiqc_files + versions = ch_versions +} diff --git a/subworkflows/local/binning.nf b/subworkflows/local/binning.nf new file mode 100644 index 00000000..51caaeb9 --- /dev/null +++ b/subworkflows/local/binning.nf @@ -0,0 +1,139 @@ +/* + * Binning with MetaBAT2 and MaxBin2 + */ + +include { METABAT2_METABAT2 } from '../../modules/nf-core/metabat2/metabat2/main' +include { METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS } from '../../modules/nf-core/metabat2/jgisummarizebamcontigdepths/main' +include { MAXBIN2 } from '../../modules/nf-core/maxbin2/main' +include { GUNZIP as GUNZIP_BINS } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_UNBINS } from '../../modules/nf-core/gunzip/main' + +include { CONVERT_DEPTHS } from '../../modules/local/convert_depths' +include { ADJUST_MAXBIN2_EXT } from '../../modules/local/adjust_maxbin2_ext' +include { SPLIT_FASTA } from '../../modules/local/split_fasta' +include { FASTA_BINNING_CONCOCT } from '../../subworkflows/nf-core/fasta_binning_concoct/main' + +workflow BINNING { + take: + assemblies // channel: [ val(meta), path(assembly), path(bams), path(bais) ] + reads // channel: [ val(meta), [ reads ] ] + + main: + + ch_versions = Channel.empty() + + // generate coverage depths for each contig + ch_summarizedepth_input = assemblies + .map { meta, assembly, bams, bais -> + [ meta, bams, bais ] + } + + METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS ( ch_summarizedepth_input ) + + ch_metabat_depths = METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.depth + .map { meta, depths -> + def meta_new = meta + [binner: 'MetaBAT2'] + [ meta_new, depths ] + } + + ch_versions = ch_versions.mix(METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.versions.first()) + + // combine depths back with assemblies + ch_metabat2_input = assemblies + .map { meta, assembly, bams, bais -> + def meta_new = meta + [binner: 'MetaBAT2'] + [ meta_new, assembly, bams, bais ] + } + .join( ch_metabat_depths, by: 0 ) + .map { meta, assembly, bams, bais, depths -> + [ meta, assembly, depths ] + } + + // convert metabat2 depth files to maxbin2 + if ( !params.skip_maxbin2 ) { + CONVERT_DEPTHS ( ch_metabat2_input ) + ch_maxbin2_input = CONVERT_DEPTHS.out.output + .map { meta, assembly, reads, depth -> + def meta_new = meta + [binner: 'MaxBin2'] + [ meta_new, assembly, reads, depth ] + } + ch_versions = ch_versions.mix(CONVERT_DEPTHS.out.versions.first()) + } + + // main bins for decompressing for MAG_DEPTHS + ch_final_bins_for_gunzip = Channel.empty() + + // final gzipped bins + ch_binning_results_gzipped_final = Channel.empty() + + // run binning + if ( !params.skip_metabat2 ) { + METABAT2_METABAT2 ( ch_metabat2_input ) + // before decompressing first have to separate and re-group due to limitation of GUNZIP module + ch_final_bins_for_gunzip = ch_final_bins_for_gunzip.mix( METABAT2_METABAT2.out.fasta.transpose() ) + ch_binning_results_gzipped_final = ch_binning_results_gzipped_final.mix( METABAT2_METABAT2.out.fasta ) + ch_versions = ch_versions.mix(METABAT2_METABAT2.out.versions.first()) + } + if ( !params.skip_maxbin2 ) { + MAXBIN2 ( ch_maxbin2_input ) + ADJUST_MAXBIN2_EXT ( MAXBIN2.out.binned_fastas ) + ch_final_bins_for_gunzip = ch_final_bins_for_gunzip.mix( ADJUST_MAXBIN2_EXT.out.renamed_bins.transpose() ) + ch_binning_results_gzipped_final = ch_binning_results_gzipped_final.mix( ADJUST_MAXBIN2_EXT.out.renamed_bins ) + ch_versions = ch_versions.mix(MAXBIN2.out.versions) + } + if ( !params.skip_concoct ){ + + ch_concoct_input = assemblies + .map { meta, bins, bams, 
bais -> + def meta_new = meta + [binner: 'CONCOCT'] + [ meta_new, bins, bams, bais ] + } + .multiMap { + meta, bins, bams, bais -> + bins: [ meta, bins ] + bams: [ meta, bams, bais ] + } + + FASTA_BINNING_CONCOCT ( ch_concoct_input.bins, ch_concoct_input.bams ) + ch_final_bins_for_gunzip = ch_final_bins_for_gunzip.mix( FASTA_BINNING_CONCOCT.out.bins.transpose() ) + ch_binning_results_gzipped_final = ch_binning_results_gzipped_final.mix( FASTA_BINNING_CONCOCT.out.bins ) + ch_versions = ch_versions.mix(FASTA_BINNING_CONCOCT.out.versions) + } + + // decide which unbinned fasta files to further filter, depending on which binners selected + // NOTE: CONCOCT does not produce 'unbins' itself, therefore not included here. + if ( !params.skip_metabat2 && params.skip_maxbin2 ) { + ch_input_splitfasta = METABAT2_METABAT2.out.unbinned + } else if ( params.skip_metabat2 && !params.skip_maxbin2 ) { + ch_input_splitfasta = MAXBIN2.out.unbinned_fasta + } else if ( params.skip_metabat2 && params.skip_maxbin2 ) { + ch_input_splitfasta = Channel.empty() + } else { + ch_input_splitfasta = METABAT2_METABAT2.out.unbinned.mix(MAXBIN2.out.unbinned_fasta) + } + + SPLIT_FASTA ( ch_input_splitfasta ) + // large unbinned contigs from SPLIT_FASTA for decompressing for MAG_DEPTHS, + // first have to separate and re-group due to limitation of GUNZIP module + ch_split_fasta_results_transposed = SPLIT_FASTA.out.unbinned.transpose() + ch_versions = ch_versions.mix(SPLIT_FASTA.out.versions) + + GUNZIP_BINS ( ch_final_bins_for_gunzip ) + ch_binning_results_gunzipped = GUNZIP_BINS.out.gunzip + .groupTuple(by: 0) + + GUNZIP_UNBINS ( ch_split_fasta_results_transposed ) + ch_splitfasta_results_gunzipped = GUNZIP_UNBINS.out.gunzip + .groupTuple(by: 0) + + ch_versions = ch_versions.mix(GUNZIP_BINS.out.versions.first()) + ch_versions = ch_versions.mix(GUNZIP_UNBINS.out.versions.first()) + + emit: + bins = ch_binning_results_gunzipped + bins_gz = ch_binning_results_gzipped_final + unbinned = ch_splitfasta_results_gunzipped + unbinned_gz = SPLIT_FASTA.out.unbinned + metabat2depths = METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS.out.depth + versions = ch_versions +} diff --git a/subworkflows/local/binning_preparation.nf b/subworkflows/local/binning_preparation.nf new file mode 100644 index 00000000..60f63a26 --- /dev/null +++ b/subworkflows/local/binning_preparation.nf @@ -0,0 +1,51 @@ +/* + * Binning preparation with Bowtie2 + */ + +include { BOWTIE2_ASSEMBLY_BUILD } from '../../modules/local/bowtie2_assembly_build' +include { BOWTIE2_ASSEMBLY_ALIGN } from '../../modules/local/bowtie2_assembly_align' + +workflow BINNING_PREPARATION { + take: + assemblies // channel: [ val(meta), path(assembly) ] + reads // channel: [ val(meta), [ reads ] ] + + main: + // build bowtie2 index for all assemblies + BOWTIE2_ASSEMBLY_BUILD ( assemblies ) + + // combine assemblies with sample reads for binning depending on specified mapping mode + if (params.binning_map_mode == 'all'){ + // combine assemblies with reads of all samples + ch_bowtie2_input = BOWTIE2_ASSEMBLY_BUILD.out.assembly_index + .combine(reads) + } else if (params.binning_map_mode == 'group'){ + // combine assemblies with reads of samples from same group + ch_reads_bowtie2 = reads.map{ meta, reads -> [ meta.group, meta, reads ] } + ch_bowtie2_input = BOWTIE2_ASSEMBLY_BUILD.out.assembly_index + .map { meta, assembly, index -> [ meta.group, meta, assembly, index ] } + .combine(ch_reads_bowtie2, by: 0) + .map { group, assembly_meta, assembly, index, reads_meta, reads -> [ assembly_meta, assembly, 
index, reads_meta, reads ] } + + } else { + // i.e. --binning_map_mode 'own' + // combine assemblies (not co-assembled) with reads from own sample + ch_reads_bowtie2 = reads.map{ meta, reads -> [ meta.id, meta, reads ] } + ch_bowtie2_input = BOWTIE2_ASSEMBLY_BUILD.out.assembly_index + .map { meta, assembly, index -> [ meta.id, meta, assembly, index ] } + .combine(ch_reads_bowtie2, by: 0) + .map { id, assembly_meta, assembly, index, reads_meta, reads -> [ assembly_meta, assembly, index, reads_meta, reads ] } + + } + + BOWTIE2_ASSEMBLY_ALIGN ( ch_bowtie2_input ) + // group mappings for one assembly + ch_grouped_mappings = BOWTIE2_ASSEMBLY_ALIGN.out.mappings + .groupTuple(by: 0) + .map { meta, assembly, bams, bais -> [ meta, assembly.sort()[0], bams, bais ] } // multiple symlinks to the same assembly -> use first of sorted list + + emit: + bowtie2_assembly_multiqc = BOWTIE2_ASSEMBLY_ALIGN.out.log.map { assembly_meta, reads_meta, log -> [ log ] } + bowtie2_version = BOWTIE2_ASSEMBLY_ALIGN.out.versions + grouped_mappings = ch_grouped_mappings +} diff --git a/subworkflows/local/binning_refinement.nf b/subworkflows/local/binning_refinement.nf new file mode 100644 index 00000000..f92bf0cb --- /dev/null +++ b/subworkflows/local/binning_refinement.nf @@ -0,0 +1,117 @@ +/* + * Binning with MetaBAT2 and MaxBin2 + */ + +include { DASTOOL_FASTATOCONTIG2BIN as DASTOOL_FASTATOCONTIG2BIN_METABAT2 } from '../../modules/nf-core/dastool/fastatocontig2bin/main.nf' +include { DASTOOL_FASTATOCONTIG2BIN as DASTOOL_FASTATOCONTIG2BIN_MAXBIN2 } from '../../modules/nf-core/dastool/fastatocontig2bin/main.nf' +include { DASTOOL_FASTATOCONTIG2BIN as DASTOOL_FASTATOCONTIG2BIN_CONCOCT } from '../../modules/nf-core/dastool/fastatocontig2bin/main.nf' +include { DASTOOL_DASTOOL } from '../../modules/nf-core/dastool/dastool/main.nf' +include { RENAME_PREDASTOOL } from '../../modules/local/rename_predastool' +include { RENAME_POSTDASTOOL } from '../../modules/local/rename_postdastool' + +/* + * Get number of columns in file (first line) + */ + +workflow BINNING_REFINEMENT { + take: + ch_contigs_for_dastool // channel: [ val(meta), path(contigs) ] + bins // channel: [ val(meta), path(bins) ] + + main: + ch_versions = Channel.empty() + + // remove domain information, will add it back later + // everything here is either unclassified or a prokaryote + ch_bins = bins + .map { meta, bins -> + def meta_new = meta - meta.subMap(['domain','refinement']) + [meta_new, bins] + } + .groupTuple() + .map { + meta, bins -> [meta, bins.flatten()] + } + + // prepare bins + ch_bins_for_fastatocontig2bin = RENAME_PREDASTOOL(ch_bins).renamed_bins + .branch { + metabat2: it[0]['binner'] == 'MetaBAT2' + maxbin2: it[0]['binner'] == 'MaxBin2' + concoct: it[0]['binner'] == 'CONCOCT' + } + + // Generate DASTool auxilary files + DASTOOL_FASTATOCONTIG2BIN_METABAT2 ( ch_bins_for_fastatocontig2bin.metabat2, "fa") + // MaxBin2 bin extension was changed to 'fa' as well in RENAME_PREDASTOOL + DASTOOL_FASTATOCONTIG2BIN_MAXBIN2 ( ch_bins_for_fastatocontig2bin.maxbin2, "fa") + DASTOOL_FASTATOCONTIG2BIN_CONCOCT ( ch_bins_for_fastatocontig2bin.concoct, "fa") + + // Run DASTOOL + ch_fastatocontig2bin_for_dastool = Channel.empty() + ch_fastatocontig2bin_for_dastool = ch_fastatocontig2bin_for_dastool + .mix(DASTOOL_FASTATOCONTIG2BIN_METABAT2.out.fastatocontig2bin) + .mix(DASTOOL_FASTATOCONTIG2BIN_MAXBIN2.out.fastatocontig2bin) + .mix(DASTOOL_FASTATOCONTIG2BIN_CONCOCT.out.fastatocontig2bin) + .map { + meta, fastatocontig2bin -> + def meta_new = meta - 
meta.subMap('binner') + [ meta_new, fastatocontig2bin ] + } + .groupTuple(by: 0) + + // Note: do not `failOnMismatch` on join here, in some cases e.g. MAXBIN2 will fail if no bins, so cannot join! + // Only want to join for DAS_Tool on bins that 'exist' + + ch_input_for_dastool = ch_contigs_for_dastool.join(ch_fastatocontig2bin_for_dastool, by: 0) + + ch_versions = ch_versions.mix(DASTOOL_FASTATOCONTIG2BIN_METABAT2.out.versions.first()) + ch_versions = ch_versions.mix(DASTOOL_FASTATOCONTIG2BIN_MAXBIN2.out.versions.first()) + ch_versions = ch_versions.mix(DASTOOL_FASTATOCONTIG2BIN_CONCOCT.out.versions.first()) + + // Run DAStool + DASTOOL_DASTOOL(ch_input_for_dastool, [], []) + ch_versions = ch_versions.mix(DASTOOL_DASTOOL.out.versions.first()) + + // Prepare bins for downstream analysis (separate from unbins, add 'binner' info and group) + // use DASTool as 'binner' info allowing according grouping of refined bin sets, + // while keeping information about original binning method in filenames and used binnames, e.g. "*-MaxBin2Refined-*.fa" + // (alternatively one could think of adding, for example, meta.orig_binner, if this would simplify code) + ch_dastool_bins_newmeta = DASTOOL_DASTOOL.out.bins.transpose() + .map { + meta, bin -> + if (bin.name != "unbinned.fa") { + def meta_new = meta + [binner: 'DASTool'] + [ meta_new, bin ] + } + } + .groupTuple() + .map { + meta, bins -> + def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified' + def meta_new = meta + [refinement: 'dastool_refined', domain: domain_class] + [ meta_new, bins ] + } + + ch_input_for_renamedastool = DASTOOL_DASTOOL.out.bins + .map { + meta, bins -> + def domain_class = params.bin_domain_classification ? 'prokarya' : 'unclassified' + def meta_new = meta + [refinement: 'dastool_refined', binner: 'DASTool', domain: domain_class] + [ meta_new, bins ] + } + + RENAME_POSTDASTOOL ( ch_input_for_renamedastool ) + + refined_unbins = RENAME_POSTDASTOOL.out.refined_unbins + .map { + meta, bins -> + def meta_new = meta + [refinement: 'dastool_refined_unbinned'] + [meta_new, bins] + } + + emit: + refined_bins = ch_dastool_bins_newmeta + refined_unbins = refined_unbins + versions = ch_versions +} diff --git a/subworkflows/local/depths.nf b/subworkflows/local/depths.nf new file mode 100644 index 00000000..a2b69c95 --- /dev/null +++ b/subworkflows/local/depths.nf @@ -0,0 +1,86 @@ +include { MAG_DEPTHS } from '../../modules/local/mag_depths' +include { MAG_DEPTHS_PLOT } from '../../modules/local/mag_depths_plot' +include { MAG_DEPTHS_SUMMARY } from '../../modules/local/mag_depths_summary' + +/* + * Get number of columns in file (first line) + */ +def getColNo(filename) { + lines = file(filename).readLines() + return lines[0].split('\t').size() +} + +/* + * Get number of rows in a file + */ +def getRowNo(filename) { + lines = file(filename).readLines() + return lines.size() +} + +workflow DEPTHS { + take: + bins_unbins //channel: val(meta), [ path(bins) ] + depths //channel: val(meta), path(depths) + reads //channel: val(meta), path(reads) + + main: + ch_versions = Channel.empty() + + // Compute bin depths for different samples (according to `binning_map_mode`) + // Create a new meta combine key first, but copy meta so that + // we retain the information about binners and domain classification + ch_depth_input = bins_unbins + .map { + meta, bins -> + def meta_combine = meta - meta.subMap('binner','domain','refinement') + [meta_combine, meta, bins] + } + .groupTuple() + .combine(depths, by: 0) + .transpose() + .map 
{ + meta_combine, meta, bins, depth -> + def meta_new = meta - meta.subMap('domain','refinement') + [meta_new, bins, depth] + } + .groupTuple(by: [0,2]) + .map { + meta, bins, depth -> + [meta, bins.unique().flatten(), depth] + } + + + + MAG_DEPTHS ( ch_depth_input ) + ch_versions = ch_versions.mix(MAG_DEPTHS.out.versions) + + // Plot bin depths heatmap for each assembly and mapped samples (according to `binning_map_mode`) + // create file containing group information for all samples + ch_sample_groups = reads + .collectFile(name:'sample_groups.tsv'){ meta, reads -> meta.id + '\t' + meta.group + '\n' } + + // Filter MAG depth files: use only those for plotting that contain depths for > 2 samples + // as well as > 2 bins + ch_mag_depths_plot = MAG_DEPTHS.out.depths + .map { meta, bin_depths_file -> + if (getColNo(bin_depths_file) > 2 && getRowNo(bin_depths_file) > 2) [ meta, bin_depths_file ] + } + + MAG_DEPTHS_PLOT ( ch_mag_depths_plot, ch_sample_groups.collect() ) + + //Depth files that are coming from bins and failed binning refinement are concatenated per meta + ch_mag_depth_out = MAG_DEPTHS.out.depths + .collectFile(keepHeader: true) { + meta, depth -> + [meta.id, depth] + } + + MAG_DEPTHS_SUMMARY ( ch_mag_depth_out.collect() ) + ch_versions = ch_versions.mix( MAG_DEPTHS_PLOT.out.versions ) + ch_versions = ch_versions.mix( MAG_DEPTHS_SUMMARY.out.versions ) + + emit: + depths_summary = MAG_DEPTHS_SUMMARY.out.summary + versions = ch_versions +} diff --git a/subworkflows/local/domain_classification.nf b/subworkflows/local/domain_classification.nf new file mode 100644 index 00000000..38291888 --- /dev/null +++ b/subworkflows/local/domain_classification.nf @@ -0,0 +1,28 @@ +/* +* Domain classification with Tiara +*/ + +include { TIARA } from '../../subworkflows/local/tiara' + +workflow DOMAIN_CLASSIFICATION { + take: + assemblies // tuple val(meta), path(assembly) + bins // tuple val(meta), path( [ bins ] ) + unbins // tuple val(meta), path( [ unbins ] ) + + main: + ch_versions = Channel.empty() + + if ( params.bin_domain_classification_tool == "tiara") { + TIARA (assemblies, bins, unbins) + } + + ch_classified_bins = TIARA.out.classified_bins + ch_classified_unbins = TIARA.out.classified_unbins + ch_versions = ch_versions.mix(TIARA.out.versions) + + emit: + classified_bins = ch_classified_bins + classified_unbins = ch_classified_unbins + versions = ch_versions +} diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf new file mode 100644 index 00000000..d3d66d47 --- /dev/null +++ b/subworkflows/local/gtdbtk.nf @@ -0,0 +1,107 @@ +/* + * GTDB-Tk bin classification, using BUSCO QC to filter bins + */ + +include { GTDBTK_DB_PREPARATION } from '../../modules/local/gtdbtk_db_preparation' +include { GTDBTK_CLASSIFYWF } from '../../modules/nf-core/gtdbtk/classifywf/main' +include { GTDBTK_SUMMARY } from '../../modules/local/gtdbtk_summary' + +workflow GTDBTK { + take: + bins // channel: [ val(meta), [bins] ] + bin_qc_summary // channel: path + gtdb // channel: path + gtdb_mash // channel: path + + main: + // Filter bins: classify only medium & high quality MAGs + ch_bin_metrics = Channel.empty() + if ( params.binqc_tool == 'busco' ){ + // Collect completeness and contamination metrics from busco summary + ch_bin_metrics = bin_qc_summary + .splitCsv(header: true, sep: '\t') + .map { row -> + def completeness = -1 + def contamination = -1 + def missing, duplicated + if (params.busco_db && file(params.busco_db).getBaseName().contains('odb10')) { + missing = row.'%Missing (specific)' 
// TODO or just take '%Complete'? + duplicated = row.'%Complete and duplicated (specific)' + } else { + missing = row.'%Missing (domain)' + duplicated = row.'%Complete and duplicated (domain)' + } + if (missing != '') completeness = 100.0 - Double.parseDouble(missing) + if (duplicated != '') contamination = Double.parseDouble(duplicated) + [row.'GenomeBin', completeness, contamination] + } + } else { + // Collect completeness and contamination metrics from CheckM/CheckM2 summary + bin_name = params.binqc_tool == 'checkm' ? 'Bin Id' : 'Name' + + ch_bin_metrics = bin_qc_summary + .splitCsv(header: true, sep: '\t') + .map { row -> + def completeness = Double.parseDouble(row.'Completeness') + def contamination = Double.parseDouble(row.'Contamination') + [row[bin_name] + ".fa", completeness, contamination] + } + } + + + // Filter bins based on collected metrics: completeness, contamination + ch_filtered_bins = bins + .transpose() + .map { meta, bin -> [bin.getName(), bin, meta]} + .join(ch_bin_metrics, failOnDuplicate: true) + .map { bin_name, bin, meta, completeness, contamination -> [meta, bin, completeness, contamination] } + .branch { + passed: (it[2] != -1 && it[2] >= params.gtdbtk_min_completeness && it[3] != -1 && it[3] <= params.gtdbtk_max_contamination) + return [it[0], it[1]] + discarded: (it[2] == -1 || it[2] < params.gtdbtk_min_completeness || it[3] == -1 || it[3] > params.gtdbtk_max_contamination) + return [it[0], it[1]] + } + + if ( gtdb.extension == 'gz' ) { + // Expects to be tar.gz! + ch_db_for_gtdbtk = GTDBTK_DB_PREPARATION ( gtdb ).db + } else if ( gtdb.isDirectory() ) { + // The classifywf module expects a list of the _contents_ of the GTDB + // database, not just the directory itself (I'm not sure why). But + // for now we generate this list before putting into a channel, + // then grouping again to pass to the module. + // Then make up meta id to match expected channel cardinality for GTDBTK + gtdb_dir = gtdb.listFiles() + ch_db_for_gtdbtk = Channel + .of(gtdb_dir) + .collect() + .map { ["gtdb", it] } + } else { + error("Unsupported object given to --gtdb, database must be supplied as either a directory or a .tar.gz file!") + } + + + // Print warning why GTDB-TK summary empty if passed channel gets no files + ch_filtered_bins.passed + .count() + .map{it == 0 ? log.warn("No contigs passed GTDB-TK min. completeness filters. GTDB-TK summary will execute but results will be empty!") : ""} + + + GTDBTK_CLASSIFYWF ( + ch_filtered_bins.passed.groupTuple(), + ch_db_for_gtdbtk, + params.gtdbtk_pplacer_useram ? 
false : true, + gtdb_mash + ) + + GTDBTK_SUMMARY ( + ch_filtered_bins.discarded.map{it[1]}.collect().ifEmpty([]), + GTDBTK_CLASSIFYWF.out.summary.map{it[1]}.collect().ifEmpty([]), + [], + [] + ) + + emit: + summary = GTDBTK_SUMMARY.out.summary + versions = GTDBTK_CLASSIFYWF.out.versions +} diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf new file mode 100644 index 00000000..7de6dd25 --- /dev/null +++ b/subworkflows/local/longread_preprocessing.nf @@ -0,0 +1,109 @@ +/* + * LONGREAD_PREPROCESSING: Preprocessing and QC for long reads + */ + +include { NANOPLOT as NANOPLOT_RAW } from '../../modules/nf-core/nanoplot/main' +include { NANOPLOT as NANOPLOT_FILTERED } from '../../modules/nf-core/nanoplot/main' +include { NANOLYSE } from '../../modules/nf-core/nanolyse/main' +include { PORECHOP_PORECHOP } from '../../modules/nf-core/porechop/porechop/main' +include { PORECHOP_ABI } from '../../modules/nf-core/porechop/abi/main' +include { FILTLONG } from '../../modules/nf-core/filtlong' +include { CHOPPER } from '../../modules/nf-core/chopper' +include { NANOQ } from '../../modules/nf-core/nanoq' + +workflow LONGREAD_PREPROCESSING { + take: + ch_raw_long_reads // [ [meta] , fastq] (mandatory) + ch_short_reads // [ [meta] , fastq1, fastq2] (mandatory) + ch_lambda_db // [fasta] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + NANOPLOT_RAW ( + ch_raw_long_reads + ) + ch_versions = ch_versions.mix(NANOPLOT_RAW.out.versions.first()) + + ch_long_reads = ch_raw_long_reads + .map { + meta, reads -> + def meta_new = meta - meta.subMap('run') + [ meta_new, reads ] + } + + if ( !params.assembly_input ) { + if (!params.skip_adapter_trimming) { + if (params.longread_adaptertrimming_tool && + params.longread_adaptertrimming_tool == 'porechop_abi') { + PORECHOP_ABI ( + ch_raw_long_reads + ) + ch_long_reads = PORECHOP_ABI.out.reads + ch_versions = ch_versions.mix(PORECHOP_ABI.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( PORECHOP_ABI.out.log ) + } else if (params.longread_adaptertrimming_tool == 'porechop') { + PORECHOP_PORECHOP ( + ch_raw_long_reads + ) + ch_long_reads = PORECHOP_PORECHOP.out.reads + ch_versions = ch_versions.mix(PORECHOP_PORECHOP.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( PORECHOP_PORECHOP.out.log ) + } + } + + if (!params.keep_lambda && params.longread_filtering_tool != 'chopper') { + NANOLYSE ( + ch_long_reads, + ch_lambda_db + ) + ch_long_reads = NANOLYSE.out.fastq + ch_versions = ch_versions.mix(NANOLYSE.out.versions.first()) + } + + if (params.longread_filtering_tool == 'filtlong') { + // join long and short reads by sample name + ch_short_reads_tmp = ch_short_reads + .map { meta, sr -> [ meta.id, meta, sr ] } + + ch_short_and_long_reads = ch_long_reads + .map { meta, lr -> [ meta.id, meta, lr ] } + .join(ch_short_reads_tmp, by: 0) + .map { id, meta_lr, lr, meta_sr, sr -> [ meta_lr, sr, lr ] } // should not occur for single-end, since SPAdes (hybrid) does not support single-end + + FILTLONG ( + ch_short_and_long_reads + ) + ch_long_reads = FILTLONG.out.reads + ch_versions = ch_versions.mix(FILTLONG.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) + } else if (params.longread_filtering_tool == 'nanoq') { + NANOQ ( + ch_long_reads, + 'fastq.gz' + ) + ch_long_reads = NANOQ.out.reads + ch_versions = ch_versions.mix(NANOQ.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix(NANOQ.out.stats) + } else if 
(params.longread_filtering_tool == 'chopper') { + CHOPPER ( + ch_long_reads, + ch_lambda_db.ifEmpty([]) + ) + ch_long_reads = CHOPPER.out.fastq + ch_versions = ch_versions.mix(CHOPPER.out.versions.first()) + } + + NANOPLOT_FILTERED ( + ch_long_reads + ) + + ch_versions = ch_versions.mix(NANOPLOT_FILTERED.out.versions.first()) + } + + emit: + long_reads = ch_long_reads + versions = ch_versions + multiqc_files = ch_multiqc_files +} diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf new file mode 100644 index 00000000..ad33b56f --- /dev/null +++ b/subworkflows/local/shortread_preprocessing.nf @@ -0,0 +1,178 @@ +/* + * SHORTREAD_PREPROCESSING: Preprocessing and QC for short reads + */ + +include { FASTQC as FASTQC_RAW } from '../../modules/nf-core/fastqc/main' +include { FASTQC as FASTQC_TRIMMED } from '../../modules/nf-core/fastqc/main' +include { FASTP } from '../../modules/nf-core/fastp/main' +include { ADAPTERREMOVAL as ADAPTERREMOVAL_PE } from '../../modules/nf-core/adapterremoval/main' +include { ADAPTERREMOVAL as ADAPTERREMOVAL_SE } from '../../modules/nf-core/adapterremoval/main' +include { BOWTIE2_REMOVAL_BUILD as BOWTIE2_HOST_REMOVAL_BUILD } from '../../modules/local/bowtie2_removal_build' +include { BOWTIE2_REMOVAL_ALIGN as BOWTIE2_HOST_REMOVAL_ALIGN } from '../../modules/local/bowtie2_removal_align' +include { BOWTIE2_REMOVAL_BUILD as BOWTIE2_PHIX_REMOVAL_BUILD } from '../../modules/local/bowtie2_removal_build' +include { BOWTIE2_REMOVAL_ALIGN as BOWTIE2_PHIX_REMOVAL_ALIGN } from '../../modules/local/bowtie2_removal_align' +include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq/main' +include { SEQTK_MERGEPE } from '../../modules/nf-core/seqtk/mergepe/main' +include { BBMAP_BBNORM } from '../../modules/nf-core/bbmap/bbnorm/main' + +workflow SHORTREAD_PREPROCESSING { + take: + ch_raw_short_reads // [ [meta] , fastq1, fastq2] (mandatory) + ch_host_fasta // [fasta] (optional) + ch_phix_db_file // [fasta] (optional) + ch_metaeuk_db // [fasta] (optional) + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + FASTQC_RAW( + ch_raw_short_reads + ) + ch_versions = ch_versions.mix(FASTQC_RAW.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_RAW.out.zip) + + if (!params.skip_clipping) { + if (params.clip_tool == 'fastp') { + FASTP( + ch_raw_short_reads, + [], + params.fastp_save_trimmed_fail, + [] + ) + ch_short_reads_prepped = FASTP.out.reads + ch_versions = ch_versions.mix(FASTP.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix(FASTP.out.json) + + } + else if (params.clip_tool == 'adapterremoval') { + + // due to strange output file scheme in AR2, have to manually separate + // SE/PE to allow correct pulling of reads after. 
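            // branch() routes each element on meta.single_end, e.g.
            //   [ [id:'s1', run:'0', single_end:true ],  [ s1.fastq.gz ] ]                  -> .single
            //   [ [id:'s2', run:'0', single_end:false],  [ s2_1.fastq.gz, s2_2.fastq.gz ] ] -> .paired
            // so single-end and paired-end libraries reach their matching ADAPTERREMOVAL_* call below
            // (file names here are purely illustrative).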
+ ch_adapterremoval_in = ch_raw_short_reads.branch { + single: it[0]['single_end'] + paired: !it[0]['single_end'] + } + + ADAPTERREMOVAL_PE(ch_adapterremoval_in.paired, []) + ADAPTERREMOVAL_SE(ch_adapterremoval_in.single, []) + + ch_short_reads_prepped = Channel.empty() + ch_short_reads_prepped = ch_short_reads_prepped.mix(ADAPTERREMOVAL_SE.out.singles_truncated, ADAPTERREMOVAL_PE.out.paired_truncated) + + ch_versions = ch_versions.mix(ADAPTERREMOVAL_PE.out.versions.first(), ADAPTERREMOVAL_SE.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix(ADAPTERREMOVAL_PE.out.settings) + ch_multiqc_files = ch_multiqc_files.mix(ADAPTERREMOVAL_SE.out.settings) + } + } + else { + ch_short_reads_prepped = ch_raw_short_reads + } + + if (params.host_fasta) { + if (params.host_fasta_bowtie2index) { + ch_host_bowtie2index = file(params.host_fasta_bowtie2index, checkIfExists: true) + } + else { + BOWTIE2_HOST_REMOVAL_BUILD( + ch_host_fasta + ) + ch_host_bowtie2index = BOWTIE2_HOST_REMOVAL_BUILD.out.index + } + } + + if (params.host_fasta || params.host_genome) { + BOWTIE2_HOST_REMOVAL_ALIGN( + ch_short_reads_prepped, + ch_host_bowtie2index + ) + ch_short_reads_hostremoved = BOWTIE2_HOST_REMOVAL_ALIGN.out.reads + ch_versions = ch_versions.mix(BOWTIE2_HOST_REMOVAL_ALIGN.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix(BOWTIE2_HOST_REMOVAL_ALIGN.out.log) + } + else { + ch_short_reads_hostremoved = ch_short_reads_prepped + } + + if (!params.keep_phix) { + BOWTIE2_PHIX_REMOVAL_BUILD( + ch_phix_db_file + ) + BOWTIE2_PHIX_REMOVAL_ALIGN( + ch_short_reads_hostremoved, + BOWTIE2_PHIX_REMOVAL_BUILD.out.index + ) + ch_short_reads_phixremoved = BOWTIE2_PHIX_REMOVAL_ALIGN.out.reads + ch_versions = ch_versions.mix(BOWTIE2_PHIX_REMOVAL_ALIGN.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix(BOWTIE2_PHIX_REMOVAL_ALIGN.out.log) + } + else { + ch_short_reads_phixremoved = ch_short_reads_hostremoved + } + + if (!(params.keep_phix && params.skip_clipping && !(params.host_genome || params.host_fasta))) { + FASTQC_TRIMMED( + ch_short_reads_phixremoved + ) + ch_versions = ch_versions.mix(FASTQC_TRIMMED.out.versions) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_TRIMMED.out.zip) + } + + // Run/Lane merging + + ch_short_reads_forcat = ch_short_reads_phixremoved + .map { meta, reads -> + def meta_new = meta - meta.subMap('run') + [meta_new, reads] + } + .groupTuple() + .branch { meta, reads -> + cat: reads.size() >= 2 + skip_cat: true + } + + CAT_FASTQ(ch_short_reads_forcat.cat.map { meta, reads -> [meta, reads.flatten()] }) + + // Ensure we don't have nests of nests so that structure is in form expected for assembly + ch_short_reads_catskipped = ch_short_reads_forcat.skip_cat.map { meta, reads -> + def new_reads = meta.single_end ? reads[0] : reads.flatten() + [meta, new_reads] + } + + // Combine single run and multi-run-merged data + ch_short_reads = Channel.empty() + ch_short_reads = CAT_FASTQ.out.reads.mix(ch_short_reads_catskipped) + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first()) + + if (params.bbnorm) { + if (params.coassemble_group) { + // Interleave pairs, to be able to treat them as single ends when calling bbnorm. This prepares + // for dropping the single_end parameter, but keeps assembly modules as they are, i.e. not + // accepting a mix of single end and pairs. + SEQTK_MERGEPE( + ch_short_reads.filter { !it[0].single_end } + ) + ch_versions = ch_versions.mix(SEQTK_MERGEPE.out.versions.first()) + // Combine the interleaved pairs with any single end libraries. 
Set the meta.single_end to true (used by the bbnorm module). + ch_bbnorm = SEQTK_MERGEPE.out.reads + .mix(ch_short_reads.filter { it[0].single_end }) + .map { [[id: sprintf("group%s", it[0].group), group: it[0].group, single_end: true], it[1]] } + .groupTuple() + } + else { + ch_bbnorm = ch_short_reads + } + BBMAP_BBNORM(ch_bbnorm) + ch_versions = ch_versions.mix(BBMAP_BBNORM.out.versions) + ch_short_reads_assembly = BBMAP_BBNORM.out.fastq + } + else { + ch_short_reads_assembly = ch_short_reads + } + + emit: + short_reads = ch_short_reads + short_reads_assembly = ch_short_reads_assembly + versions = ch_versions + multiqc_files = ch_multiqc_files +} diff --git a/subworkflows/local/tiara.nf b/subworkflows/local/tiara.nf new file mode 100644 index 00000000..ab274cc8 --- /dev/null +++ b/subworkflows/local/tiara.nf @@ -0,0 +1,128 @@ +include { TIARA_TIARA } from '../../modules/nf-core/tiara/tiara/main' +include { TIARA_CLASSIFY } from '../../modules/local/tiara_classify' +include { DASTOOL_FASTATOCONTIG2BIN as DASTOOL_FASTATOCONTIG2BIN_TIARA } from '../../modules/nf-core/dastool/fastatocontig2bin/main' +include { COMBINE_TSV as TIARA_SUMMARY } from '../../modules/local/combine_tsv' + +workflow TIARA { + take: + assemblies // tuple val(meta), path(assembly) + bins // tuple val(meta), path( [ bins ] ) + unbins // tuple val(meta), path( [ unbins ] ) + + main: + ch_versions = Channel.empty() + + bins = bins + .map { meta, bins -> + def meta_new = meta + [bin: 'bins'] + meta_new.bin = 'bins' + [meta_new, bins] + } + + unbins = unbins + .map { meta, unbins -> + def meta_new = meta + [bin: 'unbins'] + [meta_new, unbins] + } + + ch_tiara_input = bins.mix(unbins) + + TIARA_TIARA ( assemblies ) + ch_versions = ch_versions.mix(TIARA_TIARA.out.versions.first()) + + // Need contig2bin file for each bin group + DASTOOL_FASTATOCONTIG2BIN_TIARA ( ch_tiara_input , 'fa') + ch_versions = ch_versions.mix(DASTOOL_FASTATOCONTIG2BIN_TIARA.out.versions.first()) + + // Need to per-assembly Tiara classifications to their bins + // Have to remove binner information from the meta map to do this + ch_contigs_to_bin_tiara = DASTOOL_FASTATOCONTIG2BIN_TIARA.out.fastatocontig2bin + .combine(ch_tiara_input, by: 0) + .map { meta, contig2bin, bins -> + def meta_join = meta - meta.subMap('binner', 'bin') + [ meta_join, meta, contig2bin, bins ] + } + + ch_tiara_classify_input = ch_contigs_to_bin_tiara + .combine( TIARA_TIARA.out.classifications, by: 0) + .map { meta_join, meta, contig2bin, bins, classifications -> + [ meta, classifications, contig2bin, bins ] + } + + TIARA_CLASSIFY( ch_tiara_classify_input ) + ch_versions = ch_versions.mix(TIARA_CLASSIFY.out.versions.first()) + + ch_eukarya_bins = TIARA_CLASSIFY.out.eukarya_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'eukarya'] + [meta_new, bins] + } + + ch_prokarya_bins = TIARA_CLASSIFY.out.prokarya_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'prokarya'] + [meta_new, bins] + } + + ch_bacteria_bins = TIARA_CLASSIFY.out.bacteria_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'bacteria'] + [meta_new, bins] + } + + ch_archaea_bins = TIARA_CLASSIFY.out.archaea_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'archaea'] + [meta_new, bins] + } + + ch_organelle_bins = TIARA_CLASSIFY.out.organelle_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'organelle'] + [meta_new, bins] + } + + ch_unknown_bins = TIARA_CLASSIFY.out.unknown_bins + .map { meta, bins -> + def meta_new = meta + [domain: 'unknown'] + [meta_new, bins] 
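                // (each TIARA_CLASSIFY output channel is tagged with a meta.domain value,
                //  i.e. eukarya, prokarya, bacteria, archaea, organelle or unknown, so the
                //  mixed channel below can still be split per domain downstream)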
+ } + + ch_classified_bins_unbins = ch_eukarya_bins + .mix(ch_prokarya_bins) + .mix(ch_bacteria_bins) + .mix(ch_archaea_bins) + .mix(ch_organelle_bins) + .mix(ch_unknown_bins) + + ch_classified_bins = ch_classified_bins_unbins + .filter { meta, bins -> + meta.bin == "bins" + } + .map { meta, bins -> + def meta_new = meta - meta.subMap('bin') + [meta_new, bins] + } + + ch_classified_unbins = ch_classified_bins_unbins + .filter { meta, bins -> + meta.bin == "unbins" + } + .map { meta, bins -> + def meta_new = meta - meta.subMap('bin') + [meta_new, bins] + } + + ch_bin_classifications = TIARA_CLASSIFY.out.bin_classifications + .map { meta, classification -> + [ classification ] + } + .collect() + + TIARA_SUMMARY(ch_bin_classifications) + + emit: + classified_bins = ch_classified_bins + classified_unbins = ch_classified_unbins + versions = ch_versions +} diff --git a/subworkflows/local/utils_nfcore_mag_pipeline/main.nf b/subworkflows/local/utils_nfcore_mag_pipeline/main.nf index 15149029..254658ec 100644 --- a/subworkflows/local/utils_nfcore_mag_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_mag_pipeline/main.nf @@ -1,4 +1,3 @@ -// // Subworkflow with functionality specific to the nf-core/mag pipeline // @@ -8,14 +7,14 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { samplesheetToList } from 'plugin/nf-schema' -include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' -include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' -include { imNotification } from '../../nf-core/utils_nfcore_pipeline' -include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' -include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' +include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { samplesheetToList } from 'plugin/nf-schema' +include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' +include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' +include { imNotification } from '../../nf-core/utils_nfcore_pipeline' +include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' +include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -24,11 +23,10 @@ include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipelin */ workflow PIPELINE_INITIALISATION { - take: version // boolean: Display version and exit validate_params // boolean: Boolean whether to validate parameters against the schema at runtime - monochrome_logs // boolean: Do not use coloured log outputs + _monochrome_logs // boolean: Do not use coloured log outputs nextflow_cli_args // array: List of positional nextflow CLI args outdir // string: The output directory where the results will be saved input // string: Path to input samplesheet @@ -40,7 +38,7 @@ workflow PIPELINE_INITIALISATION { // // Print version and exit if required and dump pipeline parameters to JSON file // - UTILS_NEXTFLOW_PIPELINE ( + UTILS_NEXTFLOW_PIPELINE( version, true, outdir, @@ -50,7 +48,7 @@ workflow PIPELINE_INITIALISATION { // // Validate parameters and generate parameter summary to stdout // - UTILS_NFSCHEMA_PLUGIN ( + UTILS_NFSCHEMA_PLUGIN( workflow, 
validate_params, null @@ -59,42 +57,104 @@ workflow PIPELINE_INITIALISATION { // // Check config provided to the pipeline // - UTILS_NFCORE_PIPELINE ( + UTILS_NFCORE_PIPELINE( nextflow_cli_args ) - // - // Custom validation for pipeline parameters - // - validateInputParameters() + // Note: normally validateInputParameters() goes here, but + // as we need to use information from samplesheet from the input channel + // moved it below // - // Create channel from input file provided through params.input + // Create channels from input file provided through params.input and params.assembly_input // - Channel - .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) + // Validate FASTQ input + ch_samplesheet = Channel + .fromList(samplesheetToList(input, "${projectDir}/assets/schema_input.json")) .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { - return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] - } + validateInputSamplesheet(it[0], it[1], it[2], it[3]) } - .groupTuple() - .map { samplesheet -> - validateInputSamplesheet(samplesheet) + + // Prepare FASTQs channel and separate short and long reads and prepare + ch_raw_short_reads = ch_samplesheet.map { meta, sr1, sr2, _lr -> + meta.run = meta.run == [] ? "0" : meta.run + meta.single_end = params.single_end + + if (params.single_end) { + return [meta, [sr1]] } - .map { - meta, fastqs -> - return [ meta, fastqs.flatten() ] + else { + return [meta, [sr1, sr2]] + } + } + + ch_raw_long_reads = ch_samplesheet.map { meta, _sr1, _sr2, lr -> + if (lr) { + meta.run = meta.run == [] ? "0" : meta.run + return [meta, lr] + } + } + + // Check already if long reads are provided, for later parameter validation + def hybrid = false + ch_raw_long_reads.map { + if (it) { + hybrid = true } - .set { ch_samplesheet } + } + + // + // Custom validation for pipeline parameters + // + validateInputParameters( + hybrid + ) + + // Validate PRE-ASSEMBLED CONTIG input when supplied + if (params.assembly_input) { + ch_input_assemblies = Channel.fromList(samplesheetToList(params.assembly_input, "${projectDir}/assets/schema_assembly_input.json")) + } + + // Prepare ASSEMBLY input channel + if (params.assembly_input) { + ch_input_assemblies.map { meta, fasta -> + return [meta + [id: params.coassemble_group ? "group-${meta.group}" : meta.id], [fasta]] + } + } + else { + ch_input_assemblies = Channel.empty() + } + + // Cross validation of input assembly and read IDs: ensure groups are all represented between reads and assemblies + if (params.assembly_input) { + ch_read_ids = ch_samplesheet + .map { meta, _sr1, _sr2, _lr -> params.coassemble_group ? meta.group : meta.id } + .unique() + .toList() + .sort() + + ch_assembly_ids = ch_input_assemblies + .map { meta, _fasta -> params.coassemble_group ? 
meta.group : meta.id } + .unique() + .toList() + .sort() + + ch_read_ids + .concat(ch_assembly_ids) + .collect(flat: false) + .map { ids1, ids2 -> + if (ids1.sort() != ids2.sort()) { + exit(1, "[nf-core/mag] ERROR: supplied IDs or Groups in read and assembly CSV files do not match!") + } + } + } emit: - samplesheet = ch_samplesheet - versions = ch_versions + raw_short_reads = ch_raw_short_reads + raw_long_reads = ch_raw_long_reads + input_assemblies = ch_input_assemblies + versions = ch_versions } /* @@ -104,7 +164,6 @@ workflow PIPELINE_INITIALISATION { */ workflow PIPELINE_COMPLETION { - take: email // string: email address email_on_fail // string: email address sent on pipeline failure @@ -141,7 +200,7 @@ workflow PIPELINE_COMPLETION { } workflow.onError { - log.error "Pipeline failed. Please refer to troubleshooting docs: https://nf-co.re/docs/usage/troubleshooting" + log.error("Pipeline failed. Please refer to troubleshooting docs: https://nf-co.re/docs/usage/troubleshooting") } } @@ -153,31 +212,140 @@ workflow PIPELINE_COMPLETION { // // Check and validate pipeline parameters // -def validateInputParameters() { +def validateInputParameters(hybrid) { genomeExistsError() + + // Check if binning mapping mode is valid + if (params.coassemble_group && params.binning_map_mode == 'own') { + error("[nf-core/mag] ERROR: Invalid combination of parameter '--binning_map_mode own' and parameter '--coassemble_group'. Select either 'all' or 'group' mapping mode when performing group-wise co-assembly.") + } + + // Check if settings concerning reproducibility of used tools are consistent and print warning if not + if (params.megahit_fix_cpu_1 || params.spades_fix_cpus != -1 || params.spadeshybrid_fix_cpus != -1) { + if (!params.skip_spades && params.spades_fix_cpus == -1) { + log.warn("[nf-core/mag]: At least one assembly process is run with a parameter to ensure reproducible results, but SPAdes not. Consider using the parameter '--spades_fix_cpus'.") + } + if (hybrid && params.skip_spadeshybrid && params.spadeshybrid_fix_cpus == -1) { + log.warn("[nf-core/mag]: At least one assembly process is run with a parameter to ensure reproducible results, but SPAdes hybrid not. Consider using the parameter '--spadeshybrid_fix_cpus'.") + } + if (!params.skip_megahit && !params.megahit_fix_cpu_1) { + log.warn("[nf-core/mag]: At least one assembly process is run with a parameter to ensure reproducible results, but MEGAHIT not. Consider using the parameter '--megahit_fix_cpu_1'.") + } + if (!params.skip_binning && params.metabat_rng_seed == 0) { + log.warn("[nf-core/mag]: At least one assembly process is run with a parameter to ensure reproducible results, but for MetaBAT2 a random seed is specified ('--metabat_rng_seed 0'). Consider specifying a positive seed instead.") + } + } + + // Check if SPAdes and single_end + if ((!params.skip_spades || !params.skip_spadeshybrid) && params.single_end) { + log.warn('[nf-core/mag]: metaSPAdes does not support single-end data. SPAdes will be skipped.') + } + + // Check if parameters for host contamination removal are valid + if (params.host_fasta && params.host_genome) { + error('[nf-core/mag] ERROR: Both host fasta reference and iGenomes genome are specified to remove host contamination! Invalid combination, please specify either --host_fasta or --host_genome.') + } + if (hybrid && (params.host_fasta || params.host_genome)) { + log.warn('[nf-core/mag]: Host read removal is only applied to short reads. 
Long reads might be filtered indirectly by Filtlong, which is set to use read qualities estimated based on k-mer matches to the short, already filtered reads.') + if (params.longreads_length_weight > 1) { + log.warn("[nf-core/mag]: The parameter --longreads_length_weight is ${params.longreads_length_weight}, causing the read length being more important for long read filtering than the read quality. Set --longreads_length_weight to 1 in order to assign equal weights.") + } + } + if (params.host_genome) { + if (!params.genomes) { + error('[nf-core/mag] ERROR: No config file containing genomes provided!') + } + // Check if host genome exists in the config file + if (!params.genomes.containsKey(params.host_genome)) { + error( + '=============================================================================\n' + " Host genome '${params.host_genome}' not found in any config files provided to the pipeline.\n" + ' Currently, the available genome keys are:\n' + " ${params.genomes.keySet().join(', ')}\n" + '===================================================================================' + ) + } + if (!params.genomes[params.host_genome].fasta) { + error("[nf-core/mag] ERROR: No fasta file specified for the host genome ${params.host_genome}!") + } + if (!params.genomes[params.host_genome].bowtie2) { + error("[nf-core/mag] ERROR: No Bowtie 2 index file specified for the host genome ${params.host_genome}!") + } + } + + // Check MetaBAT2 inputs + if (!params.skip_metabat2 && params.min_contig_size < 1500) { + log.warn("[nf-core/mag]: Specified min. contig size under minimum for MetaBAT2. MetaBAT2 will be run with 1500 (other binners not affected). You supplied: --min_contig_size ${params.min_contig_size}") + } + + // Check more than one binner is run for bin refinement (required DAS by Tool) + // If the number of run binners (i.e., number of not-skipped) is more than one, otherwise throw an error + if (params.refine_bins_dastool && !([params.skip_metabat2, params.skip_maxbin2, params.skip_concoct].count(false) > 1)) { + error('[nf-core/mag] ERROR: Bin refinement with --refine_bins_dastool requires at least two binners to be running (not skipped). Check input.') + } + + // Check that bin refinement is actually turned on if any of the refined bins are requested for downstream + if (!params.refine_bins_dastool && params.postbinning_input != 'raw_bins_only') { + error("[nf-core/mag] ERROR: The parameter '--postbinning_input ${params.postbinning_input}' for downstream steps can only be specified if bin refinement is activated with --refine_bins_dastool! Check input.") + } + + // Check if BUSCO parameters combinations are valid + if (params.skip_binqc && params.binqc_tool == 'checkm') { + error("[nf-core/mag] ERROR: Both --skip_binqc and --binqc_tool 'checkm' are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool.") + } + if (params.skip_binqc) { + if (params.busco_db) { + error("[nf-core/mag] ERROR: Both --skip_binqc and --busco_db are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool 'busco' with --busco_db.") + } + if (params.busco_auto_lineage_prok) { + error("[nf-core/mag] ERROR: Both --skip_binqc and --busco_auto_lineage_prok are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool 'busco' with --busco_auto_lineage_prok.") + } + } + + if (params.skip_binqc && !params.skip_gtdbtk) { + log.warn('[nf-core/mag]: --skip_binqc is specified, but --skip_gtdbtk is explictly set to run! 
GTDB-tk will be omitted because GTDB-tk bin classification requires bin filtering based on BUSCO or CheckM QC results to avoid GTDB-tk errors.') + } + + // Check if CAT parameters are valid + if (params.cat_db && params.cat_db_generate) { + error('[nf-core/mag] ERROR: Invalid combination of parameters --cat_db and --cat_db_generate is specified! Please specify either --cat_db or --cat_db_generate.') + } + if (params.save_cat_db && !params.cat_db_generate) { + error('[nf-core/mag] ERROR: Invalid parameter combination: parameter --save_cat_db specified, but not --cat_db_generate! Note also that the parameter --save_cat_db does not work in combination with --cat_db.') + } + + // Check MetaEuk db paramaters + if (params.metaeuk_mmseqs_db && params.metaeuk_db) { + error('[nf-core/mag] ERROR: Invalid parameter combination: both --metaeuk_mmseqs_db and --metaeuk_db are specified! Please specify either --metaeuk_mmseqs_db or --metaeuk_db.') + } + if (params.save_mmseqs_db && !params.metaeuk_mmseqs_db) { + error('[nf-core/mag] ERROR: Invalid parameter combination: --save_mmseqs_db supplied but no database has been requested for download with --metaeuk_mmseqs_db!') + } + + // Check Prokka parameters + if (params.prokka_with_compliance && !params.prokka_compliance_centre) { + error('[nf-core/mag] ERROR: Invalid parameter combination: running PROKKA with compliance mode requires a centre name specified with `--prokka_compliance_centre `!') + } } // // Validate channels from input samplesheet // -def validateInputSamplesheet(input) { - def (metas, fastqs) = input[1..2] +def validateInputSamplesheet(meta, sr1, sr2, lr) { - // Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end - def endedness_ok = metas.collect{ meta -> meta.single_end }.unique().size == 1 - if (!endedness_ok) { - error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}") + if (!sr2 && !params.single_end) { + error("[nf-core/mag] ERROR: Single-end data must be executed with `--single_end`. Note that it is not possible to mix single- and paired-end data in one run! Check input TSV for sample: ${meta.id}") + } + if (sr2 && params.single_end) { + error("[nf-core/mag] ERROR: Paired-end data must be executed without `--single_end`. Note that it is not possible to mix single- and paired-end data in one run! Check input TSV for sample: ${meta.id}") } - return [ metas[0], fastqs ] + return [meta, sr1, sr2, lr] } + // // Get attribute from genome config file e.g. 
fasta // def getGenomeAttribute(attribute) { - if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { - if (params.genomes[ params.genome ].containsKey(attribute)) { - return params.genomes[ params.genome ][ attribute ] + if (params.genomes && params.host_genome && params.genomes.containsKey(params.host_genome)) { + if (params.genomes[params.host_genome].containsKey(attribute)) { + return params.genomes[params.host_genome][attribute] } } return null @@ -187,12 +355,8 @@ def getGenomeAttribute(attribute) { // Exit pipeline if incorrect --genome key provided // def genomeExistsError() { - if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + - " Currently, the available genome keys are:\n" + - " ${params.genomes.keySet().join(", ")}\n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + if (params.genomes && params.host_genome && !params.genomes.containsKey(params.genome)) { + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " Genome '${params.host_genome}' not found in any config files provided to the pipeline.\n" + " Currently, the available genome keys are:\n" + " ${params.host_genomes.keySet().join(", ")}\n" + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" error(error_string) } } @@ -204,11 +368,11 @@ def toolCitationText() { // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def citation_text = [ - "Tools used in the workflow included:", - "FastQC (Andrews 2010),", - "MultiQC (Ewels et al. 2016)", - "." - ].join(' ').trim() + "Tools used in the workflow included:", + "FastQC (Andrews 2010),", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() return citation_text } @@ -218,9 +382,9 @@ def toolBibliographyText() { // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI</li>" : "", // Uncomment function in methodsDescriptionText to render in MultiQC report def reference_text = [ - "<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).</li>", - "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" - ].join(' ').trim() + "<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).</li>", + "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" + ].join(' ').trim() return reference_text } @@ -242,7 +406,10 @@ def methodsDescriptionText(mqc_methods_yaml) { temp_doi_ref += "(doi: ${doi_ref.replace("https://doi.org/", "").replace(" ", "")}), " } meta["doi_text"] = temp_doi_ref.substring(0, temp_doi_ref.length() - 2) - } else meta["doi_text"] = "" + } + else { + meta["doi_text"] = "" + } meta["nodoi_text"] = meta.manifest_map.doi ? "" : "<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " // Tool references @@ -256,9 +423,8 @@ def methodsDescriptionText(mqc_methods_yaml) { def methods_text = mqc_methods_yaml.text - def engine = new groovy.text.SimpleTemplateEngine() + def engine = new groovy.text.SimpleTemplateEngine() def description_html = engine.createTemplate(methods_text).make(meta) return description_html.toString() } - diff --git a/subworkflows/local/virus_identification.nf b/subworkflows/local/virus_identification.nf new file mode 100644 index 00000000..4a3a2dac --- /dev/null +++ b/subworkflows/local/virus_identification.nf @@ -0,0 +1,30 @@ +/* + * geNomad: Identification of mobile genetic elements + */ + +include { GENOMAD_DOWNLOAD } from '../../modules/nf-core/genomad/download/main' +include { GENOMAD_ENDTOEND } from '../../modules/nf-core/genomad/endtoend/main' + +workflow VIRUS_IDENTIFICATION { + take: + ch_assemblies // [ [ meta] , fasta ], input scaffolds (mandatory) + ch_genomad_db // [ db ], presupplied geNomad database (optional) + + main: + ch_versions = Channel.empty() + + if ( params.genomad_db ) { + ch_db_for_genomad = ch_genomad_db + } else { + ch_db_for_genomad = GENOMAD_DOWNLOAD( ).genomad_db + ch_versions.mix( GENOMAD_DOWNLOAD.out.versions ) + } + + ch_identified_viruses = GENOMAD_ENDTOEND ( ch_assemblies, ch_db_for_genomad ).virus_fasta + ch_versions.mix( GENOMAD_ENDTOEND.out.versions ) + + emit: + identified_viruses = ch_identified_viruses + versions = ch_versions + +} diff --git a/subworkflows/nf-core/fasta_binning_concoct/main.nf b/subworkflows/nf-core/fasta_binning_concoct/main.nf new file mode 100644 index 00000000..ee51044c --- /dev/null +++ b/subworkflows/nf-core/fasta_binning_concoct/main.nf @@ -0,0 +1,55 @@ +include { CONCOCT_CUTUPFASTA } from '../../../modules/nf-core/concoct/cutupfasta/main.nf' +include { CONCOCT_CONCOCTCOVERAGETABLE } from '../../../modules/nf-core/concoct/concoctcoveragetable/main.nf' +include { CONCOCT_CONCOCT } from '../../../modules/nf-core/concoct/concoct/main.nf' +include { CONCOCT_MERGECUTUPCLUSTERING } from '../../../modules/nf-core/concoct/mergecutupclustering/main.nf' +include { CONCOCT_EXTRACTFASTABINS } from '../../../modules/nf-core/concoct/extractfastabins/main.nf' + +workflow FASTA_BINNING_CONCOCT { + + take: + ch_fasta // channel (mandatory): [ val(meta), [ fasta ] ] (raw contigs from assembly) + ch_bam // channel (mandatory): [ val(meta), [ bam ], [bai]] (bam files of original FASTQ Files mapped back to each contig. 
meta must correspond to ch_fasta) + + main: + ch_versions = Channel.empty() + + // required to create bedfile due to coverage table + produce_bedfile = true + + CONCOCT_CUTUPFASTA ( ch_fasta, produce_bedfile ) + ch_versions = ch_versions.mix(CONCOCT_CUTUPFASTA.out.versions.first()) + + ch_cutupfasta_for_concoctcoveragetable = CONCOCT_CUTUPFASTA.out.bed + .join( ch_bam, failOnMismatch: true ) + + CONCOCT_CONCOCTCOVERAGETABLE ( ch_cutupfasta_for_concoctcoveragetable ) + ch_versions = ch_versions.mix(CONCOCT_CONCOCTCOVERAGETABLE.out.versions.first()) + + ch_concoctcoveragetable_for_concoctconcoct = CONCOCT_CONCOCTCOVERAGETABLE.out.tsv + .join(CONCOCT_CUTUPFASTA.out.fasta, failOnMismatch: true) + + CONCOCT_CONCOCT( ch_concoctcoveragetable_for_concoctconcoct ) + ch_versions = ch_versions.mix(CONCOCT_CONCOCT.out.versions.first()) + + CONCOCT_MERGECUTUPCLUSTERING ( CONCOCT_CONCOCT.out.clustering_csv ) + ch_versions = ch_versions.mix( CONCOCT_MERGECUTUPCLUSTERING.out.versions.first()) + + ch_mergecutupclustering_for_extractfastabins = ch_fasta + .join(CONCOCT_MERGECUTUPCLUSTERING.out.csv, failOnMismatch: false) + + CONCOCT_EXTRACTFASTABINS ( ch_mergecutupclustering_for_extractfastabins ) + ch_versions = ch_versions.mix(CONCOCT_EXTRACTFASTABINS.out.versions.first()) + + emit: + coverage_table = CONCOCT_CONCOCTCOVERAGETABLE.out.tsv // channel: [ val(meta), [ tsv ] ] + + original_csv = CONCOCT_CONCOCT.out.original_data_csv // channel: [ val(meta), [ csv ] ] + raw_clustering_csv = CONCOCT_CONCOCT.out.clustering_csv // channel: [ val(meta), [ csv ] ] + pca_original = CONCOCT_CONCOCT.out.pca_components_csv // channel: [ val(meta), [ csv ] ] + pca_transformed = CONCOCT_CONCOCT.out.pca_transformed_csv // channel: [ val(meta), [ csv ] ] + + cluster_table = CONCOCT_MERGECUTUPCLUSTERING.out.csv // channel: [ val(meta), [ csv ] ] + bins = CONCOCT_EXTRACTFASTABINS.out.fasta // channel: [ val(meta), [ fasta ] ] + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/fasta_binning_concoct/meta.yml b/subworkflows/nf-core/fasta_binning_concoct/meta.yml new file mode 100644 index 00000000..8ac10160 --- /dev/null +++ b/subworkflows/nf-core/fasta_binning_concoct/meta.yml @@ -0,0 +1,76 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "fasta_binning_concoct" +description: Runs the CONCOCT workflow of contig binning +keywords: + - concoct + - binning + - metagenomics + - contigs +components: + - concoct/cutupfasta + - concoct/concoctcoveragetable + - concoct/concoct + - concoct/mergecutupclustering + - concoct/extractfastabins +input: + - ch_fasta: + type: file + description: | + Structure: [ val(meta), path(fasta)] + File containing raw assembled contigs in FASTA format. + - ch_bam: + type: file + description: | + Structure: [ val(meta), path(bam), path(bai)] + BAM and associated index files file representing reads mapped against each + contig in ch_fasta. Meta must be identical between ch_fasta and ch_bam entries. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - coverage_table: + type: file + description: | + Structure: [ val(meta), path(tsv)] + (Sub)contig coverage table + - original_csv: + type: file + description: | + Structure: [ val(meta), path(csv) ] + Original CONCOCT GT1000 output + - raw_clustering_csv: + type: file + description: | + Structure: [ val(meta), path(csv) ] + CSV containing information which subcontig is assigned to which cluster + - pca_original: + type: file + description: | + Structure: [ val(meta), path(csv) ] + CSV file containing untransformed PCA component values + - pca_transformed: + type: file + description: | + Structure: [ val(meta), path(csv) ] + CSV file transformed PCA component values + - cluster_table: + type: file + description: | + Structure: [ val(meta), path(csv) ] + CSV file containing final cluster assignments of original input contigs + - bin: + type: file + description: | + Structure: [ val(meta), path(fasta) ] + FASTA files containing CONCOCT predicted bin clusters, named numerically + by CONCOCT cluster ID + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/subworkflows/nf-core/fasta_binning_concoct/tests/main.nf.test b/subworkflows/nf-core/fasta_binning_concoct/tests/main.nf.test new file mode 100644 index 00000000..ab5eb230 --- /dev/null +++ b/subworkflows/nf-core/fasta_binning_concoct/tests/main.nf.test @@ -0,0 +1,46 @@ +nextflow_workflow { + + name "Test Subworkflow FASTA_BINNING_CONCOCT" + script "../main.nf" + workflow "FASTA_BINNING_CONCOCT" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fasta_binning_concoct" + + tag "concoct" + tag "concoct/cutupfasta" + tag "concoct/concoctcoveragetable" + tag "concoct/concoct" + tag "concoct/mergecutupclustering" + tag "concoct/extractfastabins" + + + test("sarscov2 - genome - fasta") { + + when { + workflow { + """ + input[0] = Channel.of( + [[ id: 'test' ], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)], + [[ id: 'test2'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)] + ) + + input[1] = Channel.of( + [[ id: 'test' ], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai', checkIfExists: true)], + [[ id: 'test2'], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true)] + ) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match()} + ) + } + } +} diff --git a/subworkflows/nf-core/fasta_binning_concoct/tests/main.nf.test.snap b/subworkflows/nf-core/fasta_binning_concoct/tests/main.nf.test.snap new file mode 100644 index 00000000..1e626dcf --- /dev/null +++ b/subworkflows/nf-core/fasta_binning_concoct/tests/main.nf.test.snap @@ -0,0 +1,223 @@ +{ + "sarscov2 - genome - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test2" + }, + "test2.tsv:md5,3e0e31b009b3f4b4345df615a70a2835" + ], + [ + { + "id": "test" + }, + "test.tsv:md5,12b125c66c7e353fe206203d203f90be" + ] + ], + "1": [ + [ + { + "id": "test2" + }, + 
"test2_original_data_gt1000.csv:md5,afe4845db494e8c3d74c11950056c6b9" + ], + [ + { + "id": "test" + }, + "test_original_data_gt1000.csv:md5,5800a76203ca027b87ab14d323958ce2" + ] + ], + "2": [ + [ + { + "id": "test2" + }, + "test2_clustering_gt1000.csv:md5,8cb3e6901075bf07966d08e1816762ce" + ], + [ + { + "id": "test" + }, + "test_clustering_gt1000.csv:md5,8cb3e6901075bf07966d08e1816762ce" + ] + ], + "3": [ + [ + { + "id": "test2" + }, + "test2_PCA_components_data_gt1000.csv:md5,e935179f138edd7f4db6c8cd1fd90d48" + ], + [ + { + "id": "test" + }, + "test_PCA_components_data_gt1000.csv:md5,b4cae93ee69a00b366ab8d7c4e0d6191" + ] + ], + "4": [ + [ + { + "id": "test2" + }, + "test2_PCA_transformed_data_gt1000.csv:md5,7e39ef9d66adb948e75faa1c9a4d1542" + ], + [ + { + "id": "test" + }, + "test_PCA_transformed_data_gt1000.csv:md5,107ff5473b8b0479ede9043fc425e5ea" + ] + ], + "5": [ + [ + { + "id": "test2" + }, + "test2.csv:md5,ac57fce859cd28f5d18e1f4bbe056a35" + ], + [ + { + "id": "test" + }, + "test.csv:md5,ac57fce859cd28f5d18e1f4bbe056a35" + ] + ], + "6": [ + [ + { + "id": "test2" + }, + "test2_3.fa.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ], + [ + { + "id": "test" + }, + "test_3.fa.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "7": [ + "versions.yml:md5,02b573fc121beb734320d0e115ae4457", + "versions.yml:md5,4bc65b076c9549a3e935fbc16d7d33fe", + "versions.yml:md5,add0cb757a41623b081a19fad811b97e", + "versions.yml:md5,c8571a624e787edcad507c36c9dab06f", + "versions.yml:md5,f45abcb3924ba6847dea4997c62a7916" + ], + "bins": [ + [ + { + "id": "test2" + }, + "test2_3.fa.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ], + [ + { + "id": "test" + }, + "test_3.fa.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "cluster_table": [ + [ + { + "id": "test2" + }, + "test2.csv:md5,ac57fce859cd28f5d18e1f4bbe056a35" + ], + [ + { + "id": "test" + }, + "test.csv:md5,ac57fce859cd28f5d18e1f4bbe056a35" + ] + ], + "coverage_table": [ + [ + { + "id": "test2" + }, + "test2.tsv:md5,3e0e31b009b3f4b4345df615a70a2835" + ], + [ + { + "id": "test" + }, + "test.tsv:md5,12b125c66c7e353fe206203d203f90be" + ] + ], + "original_csv": [ + [ + { + "id": "test2" + }, + "test2_original_data_gt1000.csv:md5,afe4845db494e8c3d74c11950056c6b9" + ], + [ + { + "id": "test" + }, + "test_original_data_gt1000.csv:md5,5800a76203ca027b87ab14d323958ce2" + ] + ], + "pca_original": [ + [ + { + "id": "test2" + }, + "test2_PCA_components_data_gt1000.csv:md5,e935179f138edd7f4db6c8cd1fd90d48" + ], + [ + { + "id": "test" + }, + "test_PCA_components_data_gt1000.csv:md5,b4cae93ee69a00b366ab8d7c4e0d6191" + ] + ], + "pca_transformed": [ + [ + { + "id": "test2" + }, + "test2_PCA_transformed_data_gt1000.csv:md5,7e39ef9d66adb948e75faa1c9a4d1542" + ], + [ + { + "id": "test" + }, + "test_PCA_transformed_data_gt1000.csv:md5,107ff5473b8b0479ede9043fc425e5ea" + ] + ], + "raw_clustering_csv": [ + [ + { + "id": "test2" + }, + "test2_clustering_gt1000.csv:md5,8cb3e6901075bf07966d08e1816762ce" + ], + [ + { + "id": "test" + }, + "test_clustering_gt1000.csv:md5,8cb3e6901075bf07966d08e1816762ce" + ] + ], + "versions": [ + "versions.yml:md5,02b573fc121beb734320d0e115ae4457", + "versions.yml:md5,4bc65b076c9549a3e935fbc16d7d33fe", + "versions.yml:md5,add0cb757a41623b081a19fad811b97e", + "versions.yml:md5,c8571a624e787edcad507c36c9dab06f", + "versions.yml:md5,f45abcb3924ba6847dea4997c62a7916" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-25T18:40:35.774932002" + } +} \ No newline at end of file diff --git a/workflows/mag.nf 
b/workflows/mag.nf index 2734cf87..de353a40 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -3,35 +3,754 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_mag_pipeline' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_mag_pipeline' -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ +// +// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// +include { BINNING_PREPARATION } from '../subworkflows/local/binning_preparation' +include { BINNING } from '../subworkflows/local/binning' +include { BIN_QC } from '../subworkflows/local/bin_qc' +include { BINNING_REFINEMENT } from '../subworkflows/local/binning_refinement' +include { VIRUS_IDENTIFICATION } from '../subworkflows/local/virus_identification' +include { GTDBTK } from '../subworkflows/local/gtdbtk' +include { ANCIENT_DNA_ASSEMBLY_VALIDATION } from '../subworkflows/local/ancient_dna' +include { DOMAIN_CLASSIFICATION } from '../subworkflows/local/domain_classification' +include { DEPTHS } from '../subworkflows/local/depths' +include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' +include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' -workflow MAG { +// +// MODULE: Installed directly from nf-core/modules +// +include { UNTAR as CENTRIFUGEDB_UNTAR } from '../modules/nf-core/untar/main' +include { CENTRIFUGE_CENTRIFUGE } from '../modules/nf-core/centrifuge/centrifuge/main' +include { CENTRIFUGE_KREPORT } from '../modules/nf-core/centrifuge/kreport/main' +include { KRONA_KRONADB } from '../modules/nf-core/krona/kronadb/main' +include { KRONA_KTIMPORTTAXONOMY } from '../modules/nf-core/krona/ktimporttaxonomy/main' +include { KRAKENTOOLS_KREPORT2KRONA as KREPORT2KRONA_CENTRIFUGE } from '../modules/nf-core/krakentools/kreport2krona/main' +include { MEGAHIT } from '../modules/nf-core/megahit/main' +include { SPADES as METASPADES } from '../modules/nf-core/spades/main' +include { SPADES as METASPADESHYBRID } from '../modules/nf-core/spades/main' +include { GUNZIP as GUNZIP_ASSEMBLIES } from '../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_ASSEMBLYINPUT } from '../modules/nf-core/gunzip' +include { PRODIGAL } from '../modules/nf-core/prodigal/main' +include { PROKKA } from '../modules/nf-core/prokka/main' +include { MMSEQS_DATABASES } from '../modules/nf-core/mmseqs/databases/main' +include { METAEUK_EASYPREDICT } from '../modules/nf-core/metaeuk/easypredict/main' + +// +// MODULE: Local to the pipeline +// +include { KRAKEN2_DB_PREPARATION } from '../modules/local/kraken2_db_preparation' 
+include { KRAKEN2 } from '../modules/local/kraken2' +include { POOL_SINGLE_READS as POOL_SHORT_SINGLE_READS } from '../modules/local/pool_single_reads' +include { POOL_PAIRED_READS } from '../modules/local/pool_paired_reads' +include { POOL_SINGLE_READS as POOL_LONG_READS } from '../modules/local/pool_single_reads' +include { QUAST } from '../modules/local/quast' +include { QUAST_BINS } from '../modules/local/quast_bins' +include { QUAST_BINS_SUMMARY } from '../modules/local/quast_bins_summary' +include { CAT_DB } from '../modules/local/cat_db' +include { CAT_DB_GENERATE } from '../modules/local/cat_db_generate' +include { CAT } from '../modules/local/cat' +include { CAT_SUMMARY } from '../modules/local/cat_summary' +include { BIN_SUMMARY } from '../modules/local/bin_summary' +include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modules/local/combine_tsv' +workflow MAG { take: - ch_samplesheet // channel: samplesheet read in from --input + ch_raw_short_reads // channel: samplesheet read in from --input + ch_raw_long_reads + ch_input_assemblies + main: ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - // - // MODULE: Run FastQC - // - FASTQC ( - ch_samplesheet + + //////////////////////////////////////////////////// + /* -- Create channel for reference databases -- */ + //////////////////////////////////////////////////// + + if (params.host_genome) { + host_fasta = params.genomes[params.host_genome].fasta ?: false + ch_host_fasta = Channel.value(file("${host_fasta}")) + host_bowtie2index = params.genomes[params.host_genome].bowtie2 ?: false + ch_host_bowtie2index = Channel.value(file("${host_bowtie2index}/*")) + } + else if (params.host_fasta) { + ch_host_fasta = Channel.value(file("${params.host_fasta}")) + } + else { + ch_host_fasta = Channel.empty() + } + + if (params.kraken2_db) { + ch_kraken2_db_file = file(params.kraken2_db, checkIfExists: true) + } + else { + ch_kraken2_db_file = [] + } + + if (params.cat_db) { + ch_cat_db_file = Channel.value(file("${params.cat_db}")) + } + else { + ch_cat_db_file = Channel.empty() + } + + if (params.krona_db) { + ch_krona_db_file = Channel.value(file("${params.krona_db}")) + } + else { + ch_krona_db_file = Channel.empty() + } + + if (!params.keep_phix) { + ch_phix_db_file = Channel.value(file("${params.phix_reference}")) + } + + if (!params.keep_lambda) { + ch_lambda_db = Channel.value(file( "${params.lambda_reference}" )) + } else { + ch_lambda_db = Channel.empty() + } + + if (params.genomad_db) { + ch_genomad_db = file(params.genomad_db, checkIfExists: true) + } + else { + ch_genomad_db = Channel.empty() + } + + gtdb = params.skip_binqc || params.skip_gtdbtk ? false : params.gtdb_db + + if (gtdb) { + gtdb = file("${gtdb}", checkIfExists: true) + gtdb_mash = params.gtdb_mash ? 
file("${params.gtdb_mash}", checkIfExists: true) : [] + } + else { + gtdb = [] + } + + if (params.metaeuk_db && !params.skip_metaeuk) { + ch_metaeuk_db = Channel.value(file("${params.metaeuk_db}", checkIfExists: true)) + } + else { + ch_metaeuk_db = Channel.empty() + } + + // Get mmseqs db for MetaEuk if requested + if (!params.skip_metaeuk && params.metaeuk_mmseqs_db) { + MMSEQS_DATABASES(params.metaeuk_mmseqs_db) + ch_metaeuk_db = MMSEQS_DATABASES.out.database + ch_versions = ch_versions.mix(MMSEQS_DATABASES.out.versions) + } + + /* + ================================================================================ + Preprocessing and QC for short reads + ================================================================================ + */ + + if (!params.assembly_input) { + SHORTREAD_PREPROCESSING( + ch_raw_short_reads, + ch_host_fasta, + ch_phix_db_file, + ch_metaeuk_db + ) + + ch_versions = ch_versions.mix(SHORTREAD_PREPROCESSING.out.versions) + ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_PREPROCESSING.out.multiqc_files.collect { it[1] }.ifEmpty([])) + ch_short_reads = SHORTREAD_PREPROCESSING.out.short_reads + ch_short_reads_assembly = SHORTREAD_PREPROCESSING.out.short_reads_assembly + + } + else { + ch_short_reads = ch_raw_short_reads.map { meta, reads -> + def meta_new = meta - meta.subMap('run') + [meta_new, reads] + } + } + + /* + ================================================================================ + Preprocessing and QC for long reads + ================================================================================ + */ + + LONGREAD_PREPROCESSING( + ch_raw_long_reads, + ch_short_reads, + ch_lambda_db + ) + + ch_versions = ch_versions.mix(LONGREAD_PREPROCESSING.out.versions) + ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_PREPROCESSING.out.multiqc_files.collect { it[1] }.ifEmpty([])) + ch_long_reads = LONGREAD_PREPROCESSING.out.long_reads + + /* + ================================================================================ + Taxonomic information + ================================================================================ + */ + + // Centrifuge + if (!params.centrifuge_db) { + ch_db_for_centrifuge = Channel.empty() + } + else { + if (file(params.centrifuge_db).isDirectory()) { + ch_db_for_centrifuge = Channel.of(file(params.centrifuge_db, checkIfExists: true)) + } + else { + ch_db_for_centrifuge = CENTRIFUGEDB_UNTAR(Channel.of([[id: 'db'], file(params.centrifuge_db, checkIfExists: true)])).untar.map { it[1] }.first() + ch_versions = ch_versions.mix(CENTRIFUGEDB_UNTAR.out.versions.first()) + } + } + + CENTRIFUGE_CENTRIFUGE( + ch_short_reads, + ch_db_for_centrifuge, + false, + false + ) + ch_versions = ch_versions.mix(CENTRIFUGE_CENTRIFUGE.out.versions.first()) + + CENTRIFUGE_KREPORT(CENTRIFUGE_CENTRIFUGE.out.results, ch_db_for_centrifuge) + ch_versions = ch_versions.mix(CENTRIFUGE_KREPORT.out.versions.first()) + + // Kraken2 + if (!ch_kraken2_db_file.isEmpty()) { + if (ch_kraken2_db_file.extension in ['gz', 'tgz']) { + // Expects to be tar.gz! 
+ ch_db_for_kraken2 = KRAKEN2_DB_PREPARATION(ch_kraken2_db_file).db + } + else if (ch_kraken2_db_file.isDirectory()) { + ch_db_for_kraken2 = Channel + .fromPath("${ch_kraken2_db_file}/*.k2d") + .collect() + .map { file -> + if (file.size() >= 3) { + def db_name = file[0].getParent().getName() + [db_name, file] + } + else { + error("Kraken2 requires '{hash,opts,taxo}.k2d' files.") + } + } + } + else { + ch_db_for_kraken2 = Channel.empty() + } + } + else { + ch_db_for_kraken2 = Channel.empty() + } + + KRAKEN2( + ch_short_reads, + ch_db_for_kraken2 ) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + ch_versions = ch_versions.mix(KRAKEN2.out.versions.first()) + + if ((params.centrifuge_db || params.kraken2_db) && !params.skip_krona) { + if (params.krona_db) { + ch_krona_db = ch_krona_db_file + } + else { + KRONA_KRONADB() + ch_krona_db = KRONA_KRONADB.out.db + ch_versions = ch_versions.mix(KRONA_KRONADB.out.versions) + } + + if (params.centrifuge_db) { + ch_centrifuge_for_krona = KREPORT2KRONA_CENTRIFUGE(CENTRIFUGE_KREPORT.out.kreport).txt.map { meta, files -> ['centrifuge', meta, files] } + ch_versions = ch_versions.mix(KREPORT2KRONA_CENTRIFUGE.out.versions.first()) + } + else { + ch_centrifuge_for_krona = Channel.empty() + } + + // Join together for Krona + ch_tax_classifications = ch_centrifuge_for_krona + .mix(KRAKEN2.out.results_for_krona) + .map { classifier, meta, report -> + def meta_new = meta + [classifier: classifier] + [meta_new, report] + } + + KRONA_KTIMPORTTAXONOMY( + ch_tax_classifications, + ch_krona_db + ) + ch_versions = ch_versions.mix(KRONA_KTIMPORTTAXONOMY.out.versions.first()) + } + + /* + ================================================================================ + Assembly + ================================================================================ + */ + + if (!params.assembly_input) { + + // Co-assembly preparation: grouping for MEGAHIT and for pooling for SPAdes + if (params.coassemble_group) { + // short reads + // group and set group as new id + ch_short_reads_grouped = ch_short_reads_assembly + .map { meta, reads -> [meta.group, meta, reads] } + .groupTuple(by: 0) + .map { group, metas, reads -> + def assemble_as_single = params.single_end || (params.bbnorm && params.coassemble_group) + def meta = [:] + meta.id = "group-${group}" + meta.group = group + meta.single_end = assemble_as_single + if (assemble_as_single) { + [meta, reads.collect { it }, []] + } + else { + [meta, reads.collect { it[0] }, reads.collect { it[1] }] + } + } + // long reads + // group and set group as new id + ch_long_reads_grouped = ch_long_reads + .map { meta, reads -> [meta.group, meta, reads] } + .groupTuple(by: 0) + .map { group, metas, reads -> + def meta = [:] + meta.id = "group-${group}" + meta.group = group + [meta, reads.collect { it }] + } + } + else { + ch_short_reads_grouped = ch_short_reads_assembly + .filter { it[0].single_end } + .map { meta, reads -> [meta, [reads], []] } + .mix( + ch_short_reads_assembly.filter { !it[0].single_end }.map { meta, reads -> [meta, [reads[0]], [reads[1]]] } + ) + ch_long_reads_grouped = ch_long_reads + } + + if (!params.skip_spades || !params.skip_spadeshybrid) { + if (params.coassemble_group) { + if (params.bbnorm) { + ch_short_reads_spades = ch_short_reads_grouped.map { [it[0], it[1]] } + } + else { + POOL_SHORT_SINGLE_READS( + ch_short_reads_grouped.filter { it[0].single_end } + ) + POOL_PAIRED_READS( + ch_short_reads_grouped.filter { 
!it[0].single_end } + ) + ch_short_reads_spades = POOL_SHORT_SINGLE_READS.out.reads.mix(POOL_PAIRED_READS.out.reads) + } + } + else { + ch_short_reads_spades = ch_short_reads_assembly + } + // long reads + if (!params.single_end && !params.skip_spadeshybrid) { + POOL_LONG_READS(ch_long_reads_grouped) + ch_long_reads_spades = POOL_LONG_READS.out.reads + } + else { + ch_long_reads_spades = Channel.empty() + } + } + else { + ch_short_reads_spades = Channel.empty() + ch_long_reads_spades = Channel.empty() + } + + // Assembly + + ch_assembled_contigs = Channel.empty() + + if (!params.single_end && !params.skip_spades) { + METASPADES(ch_short_reads_spades.map { meta, reads -> [meta, reads, [], []] }, [], []) + ch_spades_assemblies = METASPADES.out.scaffolds.map { meta, assembly -> + def meta_new = meta + [assembler: 'SPAdes'] + [meta_new, assembly] + } + ch_assembled_contigs = ch_assembled_contigs.mix(ch_spades_assemblies) + ch_versions = ch_versions.mix(METASPADES.out.versions.first()) + } + + if (!params.single_end && !params.skip_spadeshybrid) { + ch_short_reads_spades_tmp = ch_short_reads_spades.map { meta, reads -> [meta.id, meta, reads] } + + ch_reads_spadeshybrid = ch_long_reads_spades + .map { meta, reads -> [meta.id, meta, reads] } + .combine(ch_short_reads_spades_tmp, by: 0) + .map { id, meta_long, long_reads, meta_short, short_reads -> [meta_short, short_reads, [], long_reads] } + + METASPADESHYBRID(ch_reads_spadeshybrid, [], []) + ch_spadeshybrid_assemblies = METASPADESHYBRID.out.scaffolds.map { meta, assembly -> + def meta_new = meta + [assembler: "SPAdesHybrid"] + [meta_new, assembly] + } + ch_assembled_contigs = ch_assembled_contigs.mix(ch_spadeshybrid_assemblies) + ch_versions = ch_versions.mix(METASPADESHYBRID.out.versions.first()) + } + + if (!params.skip_megahit) { + MEGAHIT(ch_short_reads_grouped) + ch_megahit_assemblies = MEGAHIT.out.contigs.map { meta, assembly -> + def meta_new = meta + [assembler: 'MEGAHIT'] + [meta_new, assembly] + } + ch_assembled_contigs = ch_assembled_contigs.mix(ch_megahit_assemblies) + ch_versions = ch_versions.mix(MEGAHIT.out.versions.first()) + } + + + + GUNZIP_ASSEMBLIES(ch_assembled_contigs) + ch_versions = ch_versions.mix(GUNZIP_ASSEMBLIES.out.versions) + + ch_assemblies = GUNZIP_ASSEMBLIES.out.gunzip + } + else { + ch_assemblies_split = ch_input_assemblies.branch { meta, assembly -> + gzipped: assembly.getExtension() == "gz" + ungzip: true + } + + GUNZIP_ASSEMBLYINPUT(ch_assemblies_split.gzipped) + ch_versions = ch_versions.mix(GUNZIP_ASSEMBLYINPUT.out.versions) + + ch_assemblies = Channel.empty() + ch_assemblies = ch_assemblies.mix(ch_assemblies_split.ungzip, GUNZIP_ASSEMBLYINPUT.out.gunzip) + } + + ch_quast_multiqc = Channel.empty() + if (!params.skip_quast) { + QUAST(ch_assemblies) + ch_versions = ch_versions.mix(QUAST.out.versions.first()) + } + + /* + ================================================================================ + Predict proteins + ================================================================================ + */ + + if (!params.skip_prodigal) { + PRODIGAL( + ch_assemblies, + 'gff' + ) + ch_versions = ch_versions.mix(PRODIGAL.out.versions.first()) + } + + /* + ================================================================================ + Virus identification + ================================================================================ + */ + + if (params.run_virus_identification) { + VIRUS_IDENTIFICATION(ch_assemblies, ch_genomad_db) + ch_versions = ch_versions.mix(VIRUS_IDENTIFICATION.out.versions.first()) 
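+        // As written in this workflow, geNomad results are reported on their own: apart from the
+        // versions file, no VIRUS_IDENTIFICATION output is consumed by the binning or annotation
+        // steps further down.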
+ } + + /* + ================================================================================ + Binning preparation + ================================================================================ + */ + + ch_bin_qc_summary = Channel.empty() + + if (!params.skip_binning || params.ancient_dna) { + BINNING_PREPARATION( + ch_assemblies, + ch_short_reads + ) + ch_versions = ch_versions.mix(BINNING_PREPARATION.out.bowtie2_version.first()) + } + + /* + ================================================================================ + Ancient DNA + ================================================================================ + */ + + if (params.ancient_dna) { + ANCIENT_DNA_ASSEMBLY_VALIDATION(BINNING_PREPARATION.out.grouped_mappings) + ch_versions = ch_versions.mix(ANCIENT_DNA_ASSEMBLY_VALIDATION.out.versions.first()) + } + + /* + ================================================================================ + Binning + ================================================================================ + */ + + if (!params.skip_binning) { + + // Make sure if running aDNA subworkflow to use the damage-corrected contigs for higher accuracy + if (params.ancient_dna && !params.skip_ancient_damagecorrection) { + BINNING( + BINNING_PREPARATION.out.grouped_mappings.join(ANCIENT_DNA_ASSEMBLY_VALIDATION.out.contigs_recalled).map { it -> [it[0], it[4], it[2], it[3]] }, + ch_short_reads + ) + } + else { + BINNING( + BINNING_PREPARATION.out.grouped_mappings, + ch_short_reads + ) + } + ch_versions = ch_versions.mix(BINNING.out.versions) + + if (params.bin_domain_classification) { + + // Make sure if running aDNA subworkflow to use the damage-corrected contigs for higher accuracy + if (params.ancient_dna && !params.skip_ancient_damagecorrection) { + ch_assemblies_for_domainclassification = ANCIENT_DNA_ASSEMBLY_VALIDATION.out.contigs_recalled + } + else { + ch_assemblies_for_domainclassification = ch_assemblies + } + + DOMAIN_CLASSIFICATION(ch_assemblies_for_domainclassification, BINNING.out.bins, BINNING.out.unbinned) + ch_binning_results_bins = DOMAIN_CLASSIFICATION.out.classified_bins + ch_binning_results_unbins = DOMAIN_CLASSIFICATION.out.classified_unbins + ch_versions = ch_versions.mix(DOMAIN_CLASSIFICATION.out.versions) + } + else { + ch_binning_results_bins = BINNING.out.bins.map { meta, bins -> + def meta_new = meta + [domain: 'unclassified'] + [meta_new, bins] + } + ch_binning_results_unbins = BINNING.out.unbinned.map { meta, bins -> + def meta_new = meta + [domain: 'unclassified'] + [meta_new, bins] + } + } + + /* + * DAS Tool: binning refinement + */ + + ch_binning_results_bins = ch_binning_results_bins.map { meta, bins -> + def meta_new = meta + [refinement: 'unrefined'] + [meta_new, bins] + } + + ch_binning_results_unbins = ch_binning_results_unbins.map { meta, bins -> + def meta_new = meta + [refinement: 'unrefined_unbinned'] + [meta_new, bins] + } + + // If any two of the binners are both skipped at once, do not run because DAS_Tool needs at least one + if (params.refine_bins_dastool) { + ch_prokarya_bins_dastool = ch_binning_results_bins.filter { meta, bins -> + meta.domain != "eukarya" + } + + ch_eukarya_bins_dastool = ch_binning_results_bins.filter { meta, bins -> + meta.domain == "eukarya" + } + + if (params.ancient_dna) { + ch_contigs_for_binrefinement = ANCIENT_DNA_ASSEMBLY_VALIDATION.out.contigs_recalled + } + else { + ch_contigs_for_binrefinement = BINNING_PREPARATION.out.grouped_mappings.map { meta, contigs, bam, bai -> [meta, contigs] } + } + + 
BINNING_REFINEMENT(ch_contigs_for_binrefinement, ch_prokarya_bins_dastool) + // ch_refined_bins = ch_eukarya_bins_dastool + // .map{ meta, bins -> + // def meta_new = meta + [refinement: 'eukaryote_unrefined'] + // [meta_new, bins] + // }.mix( BINNING_REFINEMENT.out.refined_bins) + + ch_refined_bins = BINNING_REFINEMENT.out.refined_bins + ch_refined_unbins = BINNING_REFINEMENT.out.refined_unbins + ch_versions = ch_versions.mix(BINNING_REFINEMENT.out.versions) + + if (params.postbinning_input == 'raw_bins_only') { + ch_input_for_postbinning_bins = ch_binning_results_bins + ch_input_for_postbinning_bins_unbins = ch_binning_results_bins.mix(ch_binning_results_unbins) + } + else if (params.postbinning_input == 'refined_bins_only') { + ch_input_for_postbinning_bins = ch_refined_bins + ch_input_for_postbinning_bins_unbins = ch_refined_bins.mix(ch_refined_unbins) + } + else if (params.postbinning_input == 'both') { + ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins) + ch_input_for_postbinning_bins = ch_all_bins + ch_input_for_postbinning_bins_unbins = ch_all_bins.mix(ch_binning_results_unbins).mix(ch_refined_unbins) + } + } + else { + ch_input_for_postbinning_bins = ch_binning_results_bins + ch_input_for_postbinning_bins_unbins = ch_binning_results_bins.mix(ch_binning_results_unbins) + } + + ch_input_for_postbinning = params.exclude_unbins_from_postbinning + ? ch_input_for_postbinning_bins + : ch_input_for_postbinning_bins_unbins + + DEPTHS(ch_input_for_postbinning, BINNING.out.metabat2depths, ch_short_reads) + ch_input_for_binsummary = DEPTHS.out.depths_summary + ch_versions = ch_versions.mix(DEPTHS.out.versions) + + /* + * Bin QC subworkflows: for checking bin completeness with either BUSCO, CHECKM, CHECKM2, and/or GUNC + */ + + if (!params.skip_binqc) { + BIN_QC(ch_input_for_postbinning) + + ch_bin_qc_summary = BIN_QC.out.qc_summary + ch_versions = ch_versions.mix(BIN_QC.out.versions) + } + + ch_quast_bins_summary = Channel.empty() + if (!params.skip_quast) { + ch_input_for_quast_bins = ch_input_for_postbinning + .groupTuple() + .map { meta, bins -> + def new_bins = bins.flatten() + [meta, new_bins] + } + + QUAST_BINS(ch_input_for_quast_bins) + ch_versions = ch_versions.mix(QUAST_BINS.out.versions.first()) + ch_quast_bin_summary = QUAST_BINS.out.quast_bin_summaries.collectFile(keepHeader: true) { meta, summary -> + ["${meta.id}.tsv", summary] + } + QUAST_BINS_SUMMARY(ch_quast_bin_summary.collect()) + ch_quast_bins_summary = QUAST_BINS_SUMMARY.out.summary + } + + /* + * CAT: Bin Annotation Tool (BAT) are pipelines for the taxonomic classification of long DNA sequences and metagenome assembled genomes (MAGs/bins) + */ + ch_cat_db = Channel.empty() + if (params.cat_db) { + CAT_DB(ch_cat_db_file) + ch_cat_db = CAT_DB.out.db + } + else if (params.cat_db_generate) { + CAT_DB_GENERATE() + ch_cat_db = CAT_DB_GENERATE.out.db + } + CAT( + ch_input_for_postbinning, + ch_cat_db + ) + // Group all classification results for each sample in a single file + ch_cat_summary = CAT.out.tax_classification_names.collectFile(keepHeader: true) { meta, classification -> + ["${meta.id}.txt", classification] + } + // Group all classification results for the whole run in a single file + CAT_SUMMARY( + ch_cat_summary.collect() + ) + ch_versions = ch_versions.mix(CAT.out.versions.first()) + ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions) + + // If CAT is not run, then the CAT global summary should be an empty channel + if (params.cat_db_generate || params.cat_db) { + ch_cat_global_summary = 
CAT_SUMMARY.out.combined + } + else { + ch_cat_global_summary = Channel.empty() + } + + /* + * GTDB-tk: taxonomic classifications using GTDB reference + */ + + if (!params.skip_gtdbtk) { + + ch_gtdbtk_summary = Channel.empty() + if (gtdb) { + + ch_gtdb_bins = ch_input_for_postbinning.filter { meta, bins -> + meta.domain != "eukarya" + } + + GTDBTK( + ch_gtdb_bins, + ch_bin_qc_summary, + gtdb, + gtdb_mash + ) + ch_versions = ch_versions.mix(GTDBTK.out.versions.first()) + ch_gtdbtk_summary = GTDBTK.out.summary + } + } + else { + ch_gtdbtk_summary = Channel.empty() + } + + if ((!params.skip_binqc) || !params.skip_quast || !params.skip_gtdbtk) { + BIN_SUMMARY( + ch_input_for_binsummary, + ch_bin_qc_summary.ifEmpty([]), + ch_quast_bins_summary.ifEmpty([]), + ch_gtdbtk_summary.ifEmpty([]), + ch_cat_global_summary.ifEmpty([]), + params.binqc_tool + ) + } + + /* + * Prokka: Genome annotation + */ + + if (!params.skip_prokka) { + ch_bins_for_prokka = ch_input_for_postbinning + .transpose() + .map { meta, bin -> + def meta_new = meta + [id: bin.getBaseName()] + [meta_new, bin] + } + .filter { meta, bin -> + meta.domain != "eukarya" + } + + PROKKA( + ch_bins_for_prokka, + [], + [] + ) + ch_versions = ch_versions.mix(PROKKA.out.versions.first()) + } + + if (!params.skip_metaeuk && (params.metaeuk_db || params.metaeuk_mmseqs_db)) { + ch_bins_for_metaeuk = ch_input_for_postbinning + .transpose() + .filter { meta, bin -> + meta.domain in ["eukarya", "unclassified"] + } + .map { meta, bin -> + def meta_new = meta + [id: bin.getBaseName()] + [meta_new, bin] + } + + METAEUK_EASYPREDICT(ch_bins_for_metaeuk, ch_metaeuk_db) + ch_versions = ch_versions.mix(METAEUK_EASYPREDICT.out.versions) + } + } // // Collate and save software versions @@ -42,31 +761,38 @@ workflow MAG { name: 'nf_core_' + 'mag_software_' + 'mqc_' + 'versions.yml', sort: true, newLine: true - ).set { ch_collated_versions } + ) + .set { ch_collated_versions } // // MODULE: MultiQC // - ch_multiqc_config = Channel.fromPath( - "$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? - Channel.fromPath(params.multiqc_config, checkIfExists: true) : - Channel.empty() - ch_multiqc_logo = params.multiqc_logo ? - Channel.fromPath(params.multiqc_logo, checkIfExists: true) : - Channel.empty() - - summary_params = paramsSummaryMap( - workflow, parameters_schema: "nextflow_schema.json") + ch_multiqc_config = Channel.fromPath( + "${projectDir}/assets/multiqc_config.yml", + checkIfExists: true + ) + ch_multiqc_custom_config = params.multiqc_config + ? Channel.fromPath(params.multiqc_config, checkIfExists: true) + : Channel.empty() + ch_multiqc_logo = params.multiqc_logo + ? Channel.fromPath(params.multiqc_logo, checkIfExists: true) + : Channel.fromPath("${workflow.projectDir}/docs/images/mag_logo_mascot_light.png", checkIfExists: true) + + summary_params = paramsSummaryMap( + workflow, + parameters_schema: "nextflow_schema.json" + ) ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) ch_multiqc_files = ch_multiqc_files.mix( - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
- file(params.multiqc_methods_description, checkIfExists: true) : - file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value( - methodsDescriptionText(ch_multiqc_custom_methods_description)) + ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml') + ) + ch_multiqc_custom_methods_description = params.multiqc_methods_description + ? file(params.multiqc_methods_description, checkIfExists: true) + : file("${projectDir}/assets/methods_description_template.yml", checkIfExists: true) + ch_methods_description = Channel.value( + methodsDescriptionText(ch_multiqc_custom_methods_description) + ) ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) ch_multiqc_files = ch_multiqc_files.mix( @@ -76,7 +802,31 @@ workflow MAG { ) ) - MULTIQC ( + ch_multiqc_files = ch_multiqc_files.mix(CENTRIFUGE_KREPORT.out.kreport.collect { it[1] }.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(KRAKEN2.out.report.collect { it[1] }.ifEmpty([])) + + if (!params.skip_quast) { + ch_multiqc_files = ch_multiqc_files.mix(QUAST.out.report.collect().ifEmpty([])) + + if (!params.skip_binning) { + ch_multiqc_files = ch_multiqc_files.mix(QUAST_BINS.out.dir.collect().ifEmpty([])) + } + } + + if (!params.skip_binning || params.ancient_dna) { + ch_multiqc_files = ch_multiqc_files.mix(BINNING_PREPARATION.out.bowtie2_assembly_multiqc.collect().ifEmpty([])) + } + + if (!params.skip_binning && !params.skip_prokka) { + ch_multiqc_files = ch_multiqc_files.mix(PROKKA.out.txt.collect { it[1] }.ifEmpty([])) + } + + if (!params.skip_binning && !params.skip_binqc && params.binqc_tool == 'busco') { + ch_multiqc_files = ch_multiqc_files.mix(BIN_QC.out.multiqc_files.collect().ifEmpty([])) + } + + + MULTIQC( ch_multiqc_files.collect(), ch_multiqc_config.toList(), ch_multiqc_custom_config.toList(), @@ -85,13 +835,7 @@ workflow MAG { [] ) - emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] - + emit: + multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] } - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/