nf-core · CarsonJM · Nov 20, 2023 · Nov 20, 2023 · Nov 20, 2023 · Nov 20, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -62,6 +62,7 @@ jobs:
             test_adapterremoval,
             test_binrefinement,
             test_virus_identification,
+            test_samplesheet,
           ]
     steps:
       - name: Free some space

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- [#543](https://github.com/nf-core/mag/pull/543) - Automatic samplesheet generation for nf-core/phageannotator (@CarsonJM)
+
 ### `Changed`
 
 ### `Fixed`

diff --git a/README.md b/README.md
@@ -34,6 +34,7 @@ The pipeline then:
 - Performs ancient DNA validation and repair with [pyDamage](https://github.com/maxibor/pydamage) and [freebayes](https://github.com/freebayes/freebayes)
 - optionally refines bins with [DAS Tool](https://github.com/cmks/DAS_Tool)
 - assigns taxonomy to bins using [GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) and/or [CAT](https://github.com/dutilh/CAT) and optionally identifies viruses in assemblies using [geNomad](https://github.com/apcamargo/genomad), or Eukaryotes with [Tiara](https://github.com/ibe-uw/tiara)
+- generates a samplesheet that can be used as input for other nf-core pipelines. Currently, [phageannotator](https://github.com/nf-core/phageannotator) is supported.
 
 Furthermore, the pipeline creates various reports in the results directory specified, including a [MultiQC](https://multiqc.info/) report summarizing some of the findings and software versions.
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -741,6 +741,22 @@ process {
         ]
     }
 
+    // Per-sample samplesheets produced by MAG_TO_SAMPLESHEET.
+    // `enabled: false` turns publishing off for this process, so nothing is
+    // copied to the path below.
+    // NOTE(review): presumably only the merged sheet from MAG_MERGE_SAMPLESHEET
+    // is meant to be user-facing - confirm.
+    withName: MAG_TO_SAMPLESHEET {
+        publishDir = [
+            path: { "${params.outdir}/samplesheet" },
+            enabled: false
+        ]
+    }
+
+    // Merged samplesheet, published to <outdir>/samplesheet.
+    // `ext.prefix` is read by the process as `task.ext.prefix` and used as the
+    // stem of the output file (`<prefix>_samplesheet.csv`); here it is set to the
+    // value of `params.nf_core_pipeline` as a plain string (not a closure).
+    withName: 'MAG_MERGE_SAMPLESHEET' {
+        ext.prefix = "${params.nf_core_pipeline}"
+        publishDir = [
+            path: { "${params.outdir}/samplesheet" },
+            mode: params.publish_dir_mode,
+            // Returning null from saveAs skips the file: versions.yml is not published
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: CUSTOM_DUMPSOFTWAREVERSIONS {
         publishDir = [
             path: { "${params.outdir}/pipeline_info" },

diff --git a/conf/test_samplesheet.config b/conf/test_samplesheet.config
@@ -0,0 +1,47 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/mag -profile test_samplesheet,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+// Parameters for the samplesheet-generation test profile: the minimal multi-run
+// test dataset, with most optional steps skipped, plus `nf_core_pipeline` set so
+// that a downstream samplesheet is generated.
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input                         = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.multirun.csv'
+    centrifuge_db                 = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_cf.tar.gz"
+    kraken2_db                    = "https://raw.githubusercontent.com/nf-core/test-datasets/mag/test_data/minigut_kraken.tgz"
+    skip_krona                    = true
+    min_length_unbinned_contigs   = 1
+    max_unbinned_contigs          = 2
+    busco_db                      = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
+    busco_clean                   = true
+    skip_gtdbtk                   = true
+    skip_concoct                  = true
+
+    // Samplesheet generation: the nf-core pipeline to create a samplesheet for
+    nf_core_pipeline            = 'phageannotator'
+
+    // For computational efficiency
+    // (skip_krona is already set under "Input data" and is not repeated here)
+    coassemble_group            = false
+    skip_binning                = true
+    skip_prokka                 = true
+    skip_spadeshybrid           = true
+    skip_quast                  = true
+    skip_prodigal               = true
+    skip_adapter_trimming       = true
+    skip_metabat2               = true
+    skip_maxbin2                = true
+    skip_busco                  = true
+}
diff --git a/docs/output.md b/docs/output.md
@@ -21,6 +21,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes)
 - [Additional summary for binned genomes](#additional-summary-for-binned-genomes)
 - [Ancient DNA](#ancient-dna)
+- [Samplesheet generation](#samplesheet-generation)
 - [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
@@ -706,6 +707,20 @@ Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correc
 
 </details>
 
+### Samplesheet generation
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `samplesheet/`
+  - `[nf_core_pipeline]_samplesheet.csv`: a samplesheet in CSV format that can be directly used as input for the specified nf-core pipeline
+
+</details>
+
+Currently, samplesheets for the following nf-core pipelines can be automatically generated:
+
+- [phageannotator](https://github.com/nf-core/phageannotator): a pipeline for identifying, annotating, and quantifying phage sequences in (meta)-genomic sequences.
+
 ### MultiQC
 
 <details markdown="1">

diff --git a/modules.json b/modules.json
@@ -36,6 +36,11 @@
                         "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                         "installed_by": ["modules"]
                     },
+                    "cat/cat": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
+                    },
                     "cat/fastq": {
                         "branch": "master",
                         "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e",

diff --git a/modules/local/mag_merge_samplesheet.nf b/modules/local/mag_merge_samplesheet.nf
@@ -0,0 +1,28 @@
+/*
+ * Merge several samplesheet CSV files into a single CSV.
+ *
+ * Input : any number of CSV files, staged under ./samplesheets/. The first line
+ *         of each file is treated as a header row; all files are assumed to
+ *         share the same header.
+ * Output: samplesheet - <prefix>_samplesheet.csv containing the header of the
+ *                       first staged file followed by the data rows of every file
+ *         versions    - versions.yml
+ *
+ * <prefix> is taken from `task.ext.prefix` and defaults to "merged".
+ */
+process MAG_MERGE_SAMPLESHEET {
+
+    conda "conda-forge::sed=4.7"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
+        'nf-core/ubuntu:20.04' }"
+
+    input:
+    path ('samplesheets/*')
+
+    output:
+    path "*_samplesheet.csv", emit: samplesheet
+    path "versions.yml"   , emit: versions
+
+    script:
+    // This process has no `meta` input, so the fallback prefix must be a literal
+    // rather than "${meta.id}" (which would fail whenever ext.prefix is unset).
+    def prefix = task.ext.prefix ?: "merged"
+    """
+    # NR == 1 keeps the very first line read (header of the first file);
+    # FNR > 1 keeps every non-header line of every file.
+    awk 'FNR > 1 || NR == 1' ./samplesheets/* > ${prefix}_samplesheet.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/mag_to_samplesheet.nf b/modules/local/mag_to_samplesheet.nf
@@ -0,0 +1,40 @@
+/*
+ * Write a two-line samplesheet (header row + one value row) for one `meta` map,
+ * using the column layout of the requested nf-core pipeline.
+ *
+ * Input : meta     - map; for 'phageannotator' the keys id, group, fastq_1,
+ *                    fastq_2 and fasta are read
+ *         pipeline - name of the target nf-core pipeline; only 'phageannotator'
+ *                    is supported
+ * Output: samplesheet - tuple of meta and <meta.id>.samplesheet.csv
+ *
+ * Throws IllegalArgumentException when `pipeline` is empty or not supported.
+ */
+process MAG_TO_SAMPLESHEET {
+    tag "$meta.id"
+
+    executor 'local'
+    memory 100.MB
+
+    input:
+    val meta
+    val pipeline
+
+    output:
+    tuple val(meta), path("*samplesheet.csv"), emit: samplesheet
+
+    exec:
+    //
+    // Create samplesheet containing metadata
+    //
+
+    // Column name -> value for the target pipeline. Declared with `def` so the
+    // variable is explicitly local instead of an implicit, undeclared one.
+    def pipeline_map = null
+    if (pipeline == 'phageannotator') {
+        pipeline_map = [
+            sample  : "${meta.id}",
+            group   : "${meta.group}",
+            fastq_1 : meta.fastq_1,
+            fastq_2 : meta.fastq_2,
+            fasta   : meta.fasta
+        ]
+    }
+
+    // Fail with a clear message instead of an obscure missing-variable error
+    if (pipeline_map == null) {
+        throw new IllegalArgumentException("MAG_TO_SAMPLESHEET: unsupported nf-core pipeline '${pipeline}' (supported: phageannotator)")
+    }
+
+    // Every header name and every value is wrapped in double quotes and comma-joined
+    def header_row  = pipeline_map.keySet().collect{ '"' + it + '"' }.join(",")
+    def value_row   = pipeline_map.values().collect{ '"' + it + '"' }.join(",")
+    def samplesheet = header_row + '\n' + value_row
+
+    // Write samplesheet to file in the task work directory so the output glob matches it
+    def samplesheet_file = task.workDir.resolve("${meta.id}.samplesheet.csv")
+    samplesheet_file.text = samplesheet
+
+}
diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml
diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf
diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml