diff --git a/README.md b/README.md index c314bb7..dbd9fbe 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,8 @@ In folders 00 to 09 you will find soft link(s) to the final result files of each `09_taxonomyAssigned_lca_result_qCov#_id#_diff#`: Intermediate and final taxonomy assignment result files +`10_create_phyloseq`: Phyloseq object created after taxonomy assignment + `work`: Holds all the results, intermediate files, ... `.nextflow`: Nextflow generated folder holding history info @@ -116,8 +118,8 @@ For single-end run: For paired-end run: `nextflow run eDNAFlow.nf --barcode 'pe_bc*' --blast_db 'Path2TestBlastDataset/file.fasta' --custom_db 'path2/customDatabase/myDb' [OPTIONS]` -For running LCA taxonomy assignment script: -`nextflow run eDNAFlow.nf --taxonomyAssignment --zotuTable "path2/curatedOruncurated_ZotuTable_file" --blastFile "path2/blastResult_file" --lca_output "my_lca_result" [OPTIONS]` +For running LCA taxonomy assignment and phyloseq scripts: +`nextflow run eDNAFlow.nf --taxonomyAssignment --zotuTable "path2/curatedOruncurated_ZotuTable_file" --blastFile "path2/blastResult_file" --lca_output "my_lca_result" --fastaFile path2/zotus.fasta --metadataFile path2/metadata.csv [OPTIONS]` ## Description of run options eDNAFlow allows execution of all or parts of the pipeline as long as the correct file formats are provided. For example, the user may choose to run eDNAFlow on a raw file that hasn't been demultiplexed, or opt to provide an already demultiplexed file. Similarly, a user may have performed the clustering with a different algorithm (e.g. DADA2) and is only interested in using the lca script. @@ -162,6 +164,10 @@ For description of LCA script and required file formats see section below: LCA ( `--blastFile "blastResult_file"`: Provide the blast result file name; +`--fastaFile "zotus.fasta"`: Provide the ZOTU, OTU or ASV fasta file name; + +`--metadataFile "metadata.csv"`: Provide the metadata file name (there must be a `sample_id` column, the file can be .csv or .tsv); + ***Optional*** `--lca_qcov "percent"`: percent of query coverage; Default is 100 @@ -172,6 +178,8 @@ For description of LCA script and required file formats see section below: LCA ( `--lca_output "string"`: Output file name; +`--optimise_tree`: It's a boolean, setting this to true will optimise the phylogenetic tree in the phyloseq object, but it will also significantly increase the running time of the pipeline; + ### Skipping and/or isolating steps @@ -231,7 +239,7 @@ This setting should be adjusted so higher threshold is employed for genetic mark `--mode 'usearch32'`: by default eDNAFlow uses the free version of usearch (i.e. usearch 32 bit version); if you have access to 64bit version it can be set via changing mode as `--mode 'usearch64'`; if you are using the `--skipDemux` option, then you also have the option of using the open source vsearch by setting mode as `--mode 'vsearch'` -`--usearch64 'Path2/Usearch64/executable'`: if mode is set to usearch64, then this option has to be specified; the full path must point to usearch64 executable +`--usearch64 'Path2/Usearch64/executable'`: if mode is set to usearch64, then this option has to be specified; the full path must point to usearch64 executable `--vsearch 'Path2/vsearch/executable'`: if mode is set to vsearch, then this option has to be specified; the full path must point to vsearch executable diff --git a/conf/base.config b/conf/base.config index ca79d98..3cdf0b9 100644 --- a/conf/base.config +++ b/conf/base.config @@ -23,6 +23,7 @@ singularity { withLabel: 'blast' { container = 'docker://ncbi/blast:2.10.1' } withLabel: 'lulu' { container = 'docker://index.docker.io/mahsamousavi/lulu:2019' } withLabel: 'lca_python3' { container = 'docker://python:3.6'} + withLabel: 'phyloseq' { container = 'docker://index.docker.io/adbennett/dada2_and_phyloseq:v0.1' } cache = 'lenient' diff --git a/eDNAFlow.nf b/eDNAFlow.nf index 35fd3a8..fc1ed89 100755 --- a/eDNAFlow.nf +++ b/eDNAFlow.nf @@ -1,121 +1,126 @@ #!/usr/bin/env nextflow def helpMessage() { - log.info""" + log.info""" - Usage: + Usage: - Examples of basic commands to run the pipeline on local machine: + Examples of basic commands to run the pipeline on local machine: - For single-end run: - nextflow run eDNAFlow.nf --reads 'file.fastq' --barcode 'bc_*.txt' --blast_db 'path2/LocalGenbankDatabase/nt' [OPTIONS] + For single-end run: + nextflow run eDNAFlow.nf --reads 'file.fastq' --barcode 'bc_*.txt' --blast_db 'path2/LocalGenbankDatabase/nt' [OPTIONS] - For paired-end run: - nextflow run eDNAFlow.nf --barcode 'pe_bc*' --blast_db 'Path2TestBlastDataset/file.fasta' --custom_db 'path2/customDatabase/myDb' [OPTIONS] + For paired-end run: + nextflow run eDNAFlow.nf --barcode 'pe_bc*' --blast_db 'Path2TestBlastDataset/file.fasta' --custom_db 'path2/customDatabase/myDb' [OPTIONS] - For running LCA taxonomy assignment script - nextflow run eDNAFlow.nf --taxonomyAssignment --zotuTable "path2/curatedOruncurated_ZotuTable_file" --blastFile "path2/blastResult_file" --lca_output "my_lca_result" [OPTIONS] + For running LCA taxonomy assignment script + nextflow run eDNAFlow.nf --taxonomyAssignment --zotuTable "path2/curatedOruncurated_ZotuTable_file" --blastFile "path2/blastResult_file" --lca_output "my_lca_result" [OPTIONS] - Mandatory arguments if sequences are NOT demultiplexed - --reads [file] Input data name (must be surrounded with quotes); - do NOT specify this option if reads are paired-end as they get identified automatically by default; - reads must be unzipped. The paired-end file name must end with _R1.fastq & _R2.fastq - --barcode [file] Barcode file name; barcode file format must match OBITools requirement; - if multiple barcode files exist (e.g. bc_1.txt; bc_2.txt) it can be specified like this: 'bc*.txt' - - At least one of the below databases must be specified. - --blast_db [dir] Absolute path to local nt databse - --custom_db [dir] Absolute path to custom database + Mandatory arguments if sequences are NOT demultiplexed + --reads [file] Input data name (must be surrounded with quotes); + do NOT specify this option if reads are paired-end as they get identified automatically by default; + reads must be unzipped. The paired-end file name must end with _R1.fastq & _R2.fastq + --barcode [file] Barcode file name; barcode file format must match OBITools requirement; + if multiple barcode files exist (e.g. bc_1.txt; bc_2.txt) it can be specified like this: 'bc*.txt' + + At least one of the below databases must be specified. + --blast_db [dir] Absolute path to local nt databse + --custom_db [dir] Absolute path to custom database - Mandatory arguments if sequences are demultiplexed - --skipDemux [bool] - --demuxedInput [file] A fasta file holding all the demultiplexed sequences; - Format of the sample identifier must match USEARCH requirements + Mandatory arguments if sequences are demultiplexed + --skipDemux [bool] + --demuxedInput [file] A fasta file holding all the demultiplexed sequences; + Format of the sample identifier must match USEARCH requirements - At least one of the below databases must be specified. - --blast_db [dir] Absolute path to local nt databse - --custom_db [dir] Absolute path to custom database - - Mandatory and optional arguments for running LCA taxonomy assignment script - --taxonomyAssignment [bool] - --zotuTable [file] A raw or LULU curated Zotu, OTU, or ASV table file; file must exist in the same directory as eDNAFlow; - --blastFile [file] Blast result file; file must exist in the same directory as eDNAFlow - For file format requirements of --zotuTable & --blastFile check out eDNAFlow GitHub page - - Optional - --lca_qcov [num] Percent of query coverage; Default is ${params.lca_qcov} - --lca_pid [num] Percent of identity; Default is ${params.lca_pid} - --lca_diff [num] The difference (Diff) between % identities of two hits when their qCov is equalDiff; Default is ${params.lca_diff} - --lca_output [string] Output file name; Default is ${params.lca_output} + At least one of the below databases must be specified. + --blast_db [dir] Absolute path to local nt databse + --custom_db [dir] Absolute path to custom database + + Mandatory and optional arguments for running LCA taxonomy assignment script + --taxonomyAssignment [bool] + --zotuTable [file] A raw or LULU curated Zotu, OTU, or ASV table file; file must exist in the same directory as eDNAFlow; + --blastFile [file] Blast result file; file must exist in the same directory as eDNAFlow + For file format requirements of --zotuTable & --blastFile check out eDNAFlow GitHub page + --fastaFile [file] ZOTU, OTU or ASV fasta file; file must exist in the same directory as eDNAFlow + --metadataFile [file] Metadata file with .csv or .tsv extension (there must be a `sample_id` column); file must exist in the same directory as eDNAFlow + + Optional + --lca_qcov [num] Percent of query coverage; Default is ${params.lca_qcov} + --lca_pid [num] Percent of identity; Default is ${params.lca_pid} + --lca_diff [num] The difference (Diff) between % identities of two hits when their qCov is equalDiff; Default is ${params.lca_diff} + --lca_output [string] Output file name; Default is ${params.lca_output} + --optimise_tree [bool] Optimise phylogenetic tree in phyloseq object; Default is ${params.optimise_tree} - Skipping Skip any of the mentioned steps - --skipDemux [bool] If this option is set, then --demuxedInput [file] must be provided - --skipFastqc [bool] - - Parameters to run eDNAFlow on Cloud/HPC - -profile [string] Currently can choose between "nimbus" (can be used if user has access to more memory i.e. cloud or HPC), - "zeus" and "magnus" (it's specific to users who have access to ZEUS/Magnus - high-throughput HPC clusters at the Pawsey Supercomputing Centre) - e.g. -profile nimbus - - --bindDir [dir] If you run eDNAFlow on Cloud or HPC, you will need to specify this option - On HPC, it usually will be /scratch and/or /group. On Cloud, it could be your mounted volume. - e.g. --bindDir "/scratch" - If you need to mount more than one directory put space in between e.g. --bindDir "/scratch /group" + Skipping Skip any of the mentioned steps + --skipDemux [bool] If this option is set, then --demuxedInput [file] must be provided + --skipFastqc [bool] + + Parameters to run eDNAFlow on Cloud/HPC + -profile [string] Currently can choose between "nimbus" (can be used if user has access to more memory i.e. cloud or HPC), + "zeus" and "magnus" (it's specific to users who have access to ZEUS/Magnus - high-throughput HPC clusters at the Pawsey Supercomputing Centre) + e.g. -profile nimbus + + --bindDir [dir] If you run eDNAFlow on Cloud or HPC, you will need to specify this option + On HPC, it usually will be /scratch and/or /group. On Cloud, it could be your mounted volume. + e.g. --bindDir "/scratch" + If you need to mount more than one directory put space in between e.g. --bindDir "/scratch /group" - General optional parameters - --help Show this help message - --publish_dir_mode [string] Choose between symlink , copy, link. Default is ${params.publish_dir_mode} - --singularityDir [dir] Directory where singularity images will be stored + General optional parameters + --help Show this help message + --publish_dir_mode [string] Choose between symlink , copy, link. Default is ${params.publish_dir_mode} + --singularityDir [dir] Directory where singularity images will be stored - Demultiplexing - --onlyDemux [bool] - --primer_mismatch [num] Number of mismatch allowed for matching primers; Default is ${params.primer_mismatch} - - Quality filtering / Merging - --minQuality [num] The minimum Phred quality score to apply for quality control of raw sequences; Default is ${params.minQuality} - --minAlignLeng [num] The minimum alignment length for merging read1 and read2; Default is ${params.minAlignLeng} - --minLen [num] The minimum length allowed for sequences; Default is ${params.minLen} - - ZOTU formation - --minsize [num] The minimum abundance; input sequences with lower abundances are removed; Default is ${params.minsize} - - Blast parameters - --blast_task [string] Blast task to be performed; Default is '${params.blast_task}'; but can be set to 'megablast' if required - --maxTarSeq [num] The maximum number of target sequences for hits per query to be returned by Blast; Default is ${params.maxTarSeq} - --perc_identity [num] Percentage of identical matches; Default is '${params.perc_identity}' - --evalue [num] Expected value for saving blast hits; Default is '${params.evalue}' - --qcov [num] The percent of the query that has to form an alignment against the reference to be retained; - Higher values prevent alignments of only a short portion of the query to a reference; Default is '${params.qcov}' - - Choice of USEARCH32 vs USEARCH64 - --mode [str] Default is '${params.mode}'; for running with 64 version the mode has to be set to --mode 'usearch64' - and below --search64 option has to be specified as well; can set to --mode 'vsearch' (only if --skipDemux is also chosen) - below --vsearch option has to be specified as well - --usearch64 [dir] Full path to where usearch64 bit version is stored locally - --vsearch [dir] Full path to where vsearch bit version is stored locally - - LULU - --lulu [file] An R script to run post-clustering curation with default settings of LULU; - This file has been provided and must be present in the same directory as other scripts; - by default eDNAFlow will be looking for this file in the same directory where eDNAFlow.nf is. - --minMatch_lulu [num] Default is '${params.minMatch_lulu}'; A minimum threshold (minimum_match) of sequence similarity - for considering any OTU as an error of another. - This setting should be adjusted so higher threshold is employed for genetic markers with little variation - and/or few expected PCR and sequencing errors (See LULU paper). - - Other options - --max_memory [str] Memory limit for each step of pipeline. e.g. --max_memory '8.GB'. Default: '${params.max_memory}' - --max_time [str] Time limit for each step of the pipeline. e.g. --max_time '2.h'. Default: '${params.max_time}' - --max_cpus [str] Maximum number of CPUs to use for each step of the pipeline. e.g. --max_cpus '1' Default: '${params.max_cpus}' - """.stripIndent() + Demultiplexing + --onlyDemux [bool] + --primer_mismatch [num] Number of mismatch allowed for matching primers; Default is ${params.primer_mismatch} + + Quality filtering / Merging + --minQuality [num] The minimum Phred quality score to apply for quality control of raw sequences; Default is ${params.minQuality} + --minAlignLeng [num] The minimum alignment length for merging read1 and read2; Default is ${params.minAlignLeng} + --minLen [num] The minimum length allowed for sequences; Default is ${params.minLen} + + ZOTU formation + --minsize [num] The minimum abundance; input sequences with lower abundances are removed; Default is ${params.minsize} + + Blast parameters + --blast_task [string] Blast task to be performed; Default is '${params.blast_task}'; but can be set to 'megablast' if required + --maxTarSeq [num] The maximum number of target sequences for hits per query to be returned by Blast; Default is ${params.maxTarSeq} + --perc_identity [num] Percentage of identical matches; Default is '${params.perc_identity}' + --evalue [num] Expected value for saving blast hits; Default is '${params.evalue}' + --qcov [num] The percent of the query that has to form an alignment against the reference to be retained; + Higher values prevent alignments of only a short portion of the query to a reference; Default is '${params.qcov}' + + Choice of USEARCH32 vs USEARCH64 vs VSEARCH + --mode [str] Default is '${params.mode}'; for running with 64 version the mode has to be set to --mode 'usearch64' + and below --search64 option has to be specified as well; can set to --mode 'vsearch' (only if --skipDemux is also chosen) + below --vsearch option has to be specified as well + --usearch64 [dir] Full path to where usearch64 bit version is stored locally + --vsearch [dir] Full path to where vsearch bit version is stored locally + + LULU + --lulu [file] An R script to run post-clustering curation with default settings of LULU; + This file has been provided and must be present in the same directory as other scripts; + by default eDNAFlow will be looking for this file in the same directory where eDNAFlow.nf is. + --minMatch_lulu [num] Default is '${params.minMatch_lulu}'; A minimum threshold (minimum_match) of sequence similarity + for considering any OTU as an error of another. + This setting should be adjusted so higher threshold is employed for genetic markers with little variation + and/or few expected PCR and sequencing errors (See LULU paper). + + Other options + --max_memory [str] Memory limit for each step of pipeline. e.g. --max_memory '8.GB'. Default: '${params.max_memory}' + --max_time [str] Time limit for each step of the pipeline. e.g. --max_time '2.h'. Default: '${params.max_time}' + --max_cpus [str] Maximum number of CPUs to use for each step of the pipeline. e.g. --max_cpus '1' Default: '${params.max_cpus}' + """.stripIndent() } + // Show help message if (params.help) { - helpMessage() - exit 0 + helpMessage() + exit 0 } + // setting up variables fastQC_input_ch = Channel.fromFilePairs(params.reads, size: -1) @@ -132,11 +137,15 @@ blast_task = params.blast_task evalue = params.evalue qcov = params.qcov lulu = file(params.lulu) +dada2_script = file(params.dada2_script) +cutadapt_script = file(params.cutadapt_script) + mode = params.mode usearch64 = params.usearch64 vsearch = params.vsearch minMatch_lulu = params.minMatch_lulu + // Setting up channels & inputs for taxonomy assignment step params.taxonomyAssignment = false @@ -145,27 +154,29 @@ if (params.taxonomyAssignment) { reads_ch = Channel.empty() fastQC_input_ch = Channel.empty() lca_script = file(params.lca_script) - zotuTable_ch = Channel.fromPath(params.zotuTable) + phyloseq_script = file(params.phyloseq_script) + (zotuTable_ch_a, zotuTable_ch_b) = Channel.fromPath(params.zotuTable).into(2) blastFile_ch = Channel.fromPath(params.blastFile) + fastaFile_ch = Channel.fromPath(params.fastaFile) + metadataFile_ch = Channel.fromPath(params.metadataFile) lca_qcov = params.lca_qcov lca_pid = params.lca_pid lca_diff = params.lca_diff lca_output = params.lca_output } else { - zotuTable_ch = Channel.empty() + zotuTable_ch_a = Channel.empty() + zotuTable_ch_b = Channel.empty() blastFile_ch = Channel.empty() + fastaFile_ch = Channel.empty() + metadataFile_ch = Channel.empty() lca_script = Channel.empty() + phyloseq_script = Channel.empty() } -// Running only demultiplexing or skipping - params.onlyDemux = false - params.skipDemux = false - params.demuxedInput = "*.fasta" - params.skipFastqc = false @@ -176,36 +187,34 @@ if (params.skipDemux) { } - /* * Process 0: FastQC to check the initial quality of raw reads */ process '00_fastQC' { - label 'fastqc' + label 'fastqc' publishDir "00_fastQC_${sample_id}", mode: params.publish_dir_mode input: - tuple val(sample_id), path(read) from fastQC_input_ch + tuple val(sample_id), path(read) from fastQC_input_ch output: - tuple val(sample_id), path('*_fastqc.{zip,html}') into res_fastQC_ch + tuple val(sample_id), path('*_fastqc.{zip,html}') into res_fastQC_ch when: - !params.skipFastqc + !params.skipFastqc script: - - if( read instanceof Path ) { - """ - fastqc -q ${read} - """ - } else { - """ - fastqc -q ${read[0]} ${read[1]} - """ - } + if( read instanceof Path ) { + """ + fastqc -q ${read} + """ + } else { + """ + fastqc -q ${read[0]} ${read[1]} + """ + } } @@ -222,89 +231,85 @@ process '01_a_quality_Filtering' { tuple val(sample_id), path(read) from reads_ch output: - tuple val(sample_id), path('*_QF.fastq') into QF_ch + tuple val(sample_id), path('*_QF.fastq') into QF_ch script: + if( read instanceof Path ) { + """ + AdapterRemoval --threads ${task.cpus} --file1 ${read} \ + --trimns --trimqualities \ + --minquality ${minQuality} \ + --basename ${sample_id} - if( read instanceof Path ) { - """ - AdapterRemoval --threads ${task.cpus} --file1 ${read} \ - --trimns --trimqualities \ - --minquality ${minQuality} \ - --basename ${sample_id} + mv ${sample_id}.truncated ${sample_id}_QF.fastq + """ - mv ${sample_id}.truncated ${sample_id}_QF.fastq + // if reads are paired-end then merge + } else { + """ + AdapterRemoval --threads ${task.cpus} --file1 ${read[0]} --file2 ${read[1]} \ + --collapse --trimns --trimqualities \ + --minquality $minQuality \ + --minalignmentlength $minAlignLeng \ + --basename ${sample_id} - """ - } - -// if reads are paired-end then merge - else { - """ - AdapterRemoval --threads ${task.cpus} --file1 ${read[0]} --file2 ${read[1]} \ - --collapse --trimns --trimqualities \ - --minquality $minQuality \ - --minalignmentlength $minAlignLeng \ - --basename ${sample_id} - - mv ${sample_id}.collapsed ${sample_id}_QF.fastq - """ - } + mv ${sample_id}.collapsed ${sample_id}_QF.fastq + """ + } } + QF_ch.into { QFres_4dmux_ch; QFres_4fastQC_ch} barcode_reads_mixed = QFres_4dmux_ch.combine(bc_ch) - .view() - + .view() process '01_b_fastQC' { - label 'fastqc' + label 'fastqc' publishDir "01_b_fastQC_${sample_id}", mode: params.publish_dir_mode input: - tuple val(sample_id), path("${sample_id}_QF.fastq") from QFres_4fastQC_ch + tuple val(sample_id), path("${sample_id}_QF.fastq") from QFres_4fastQC_ch output: - path("*_fastqc.{zip,html}") into last_fastQC_ch + path("*_fastqc.{zip,html}") into last_fastQC_ch when: - !params.skipFastqc + !params.skipFastqc script: - """ - fastqc -q ${"${sample_id}_QF.fastq"} - """ + """ + fastqc -q ${"${sample_id}_QF.fastq"} + """ } - /* * Process 2: Assigning reads to samples/demultiplexing */ - process '02_assigned_dmux' { label 'obitools' publishDir "02_assigned_dmux_${sample_id}_${barcode.baseName}", mode: params.publish_dir_mode input: - tuple val(sample_id), path(read), path(barcode) from barcode_reads_mixed - + tuple val(sample_id), path(read), path(barcode) from barcode_reads_mixed output: - tuple val(sample_id), val("${barcode.baseName}"), path("*_Dmux.fastq") into dmux_ch + tuple val(sample_id), val("${barcode.baseName}"), path("*_Dmux.fastq") into dmux_ch script: - """ - ngsfilter -t ${barcode} -e ${primer_mismatch} -u "orphan.fastq" ${read} > "${sample_id}_${barcode.baseName}_QF_Dmux.fastq" - """ + """ + ngsfilter -t ${barcode} -e ${primer_mismatch} -u "orphan.fastq" ${read} > "${sample_id}_${barcode.baseName}_QF_Dmux.fastq" + """ } + grouped_demux = dmux_ch.groupTuple() + /* * Process 3: Cat multiple file after demux, Filtering sequences shorter than min length and single barcoded reads */ @@ -315,18 +320,16 @@ process '03_Length_filtered' { publishDir "03_Length_filtered_${sample_id}", mode: params.publish_dir_mode input: - tuple val(sample_id), val(barcode_files), path(fastq_files) from grouped_demux + tuple val(sample_id), val(barcode_files), path(fastq_files) from grouped_demux + output: - tuple val(sample_id), path('*_QF_Dmux_minLF.fastq') into lenFilt_ch - + tuple val(sample_id), path('*_QF_Dmux_minLF.fastq') into lenFilt_ch script: - """ - cat ${fastq_files} > "${sample_id}_QF_Dmux.fastq" - obigrep -l $minLen -p 'forward_tag is not None and reverse_tag is not None' ${sample_id}_QF_Dmux.fastq > ${sample_id}_QF_Dmux_minLF.fastq - - """ - + """ + cat ${fastq_files} > "${sample_id}_QF_Dmux.fastq" + obigrep -l $minLen -p 'forward_tag is not None and reverse_tag is not None' ${sample_id}_QF_Dmux.fastq > ${sample_id}_QF_Dmux_minLF.fastq + """ } @@ -334,24 +337,25 @@ process '03_Length_filtered' { * Process 4: Split the file based on samples */ - process '04_splitSamples_relabel_Cat' { label 'obitools' publishDir "04_splitSamples_${sample_id}", mode: params.publish_dir_mode input: - tuple val(sample_id), path(fastqs) from lenFilt_ch + tuple val(sample_id), path(fastqs) from lenFilt_ch + output: - tuple val(sample_id), path("$sample_id/*.fastq") into split_ch + tuple val(sample_id), path("$sample_id/*.fastq") into split_ch + script: - """ - mkdir ${sample_id} - obisplit -t sample -u "noSampleID.fastq" $fastqs - mv *.fastq ${sample_id} - mv ${sample_id}/$fastqs .. - mv ${sample_id}/noSampleID.fastq noSampleID.fastq.ignore - """ + """ + mkdir ${sample_id} + obisplit -t sample -u "noSampleID.fastq" $fastqs + mv *.fastq ${sample_id} + mv ${sample_id}/$fastqs .. + mv ${sample_id}/noSampleID.fastq noSampleID.fastq.ignore + """ } @@ -359,84 +363,84 @@ process '04_splitSamples_relabel_Cat' { * Process 5: Relabel file for usearch */ - process '05_relabel_Cat' { - label 'usearch' + label 'usearch' - publishDir "${task.process}_${sample_id}", mode: params.publish_dir_mode - - input: - tuple val(sample_id), path(fastqs) from split_ch + publishDir "${task.process}_${sample_id}", mode: params.publish_dir_mode - output: - tuple val(sample_id), path("*.relabeled.fastq"), path("CountOfSeq.txt"), path("*_relabeled4Usearch.fastq") into addition_ch - tuple val(sample_id), path("*_upper.fasta") into relabel_ch + input: + tuple val(sample_id), path(fastqs) from split_ch + output: + tuple val(sample_id), path("*.relabeled.fastq"), path("CountOfSeq.txt"), path("*_relabeled4Usearch.fastq") into addition_ch + tuple val(sample_id), path("*_upper.fasta") into relabel_ch - script: - if(mode == 'usearch32') - """ - for files in ${fastqs} - do - label=\$(echo \$files | cut -d '/' -f 3 | cut -d '.' -f 1) - usearch -fastx_relabel \$files -prefix \${label}. -fastqout \${label}.relabeled.fastq - done + script: + if(mode == 'usearch32') + """ + for files in ${fastqs} + do + label=\$(echo \$files | cut -d '/' -f 3 | cut -d '.' -f 1) + usearch -fastx_relabel \$files -prefix \${label}. -fastqout \${label}.relabeled.fastq + done - for files in *.relabeled.fastq - do - name=\$(echo \$files | cut -d '/' -f '2' | cut -d '.' -f 1) - echo \${name} >> CountOfSeq.txt - grep "^@\${name}" \$files | wc -l >> CountOfSeq.txt - done - - cat *.relabeled.fastq > "${sample_id}_QF_Dmux_minLF_relabeled4Usearch.fastq" + for files in *.relabeled.fastq + do + name=\$(echo \$files | cut -d '/' -f '2' | cut -d '.' -f 1) + echo \${name} >> CountOfSeq.txt + grep "^@\${name}" \$files | wc -l >> CountOfSeq.txt + done + + cat *.relabeled.fastq > "${sample_id}_QF_Dmux_minLF_relabeled4Usearch.fastq" - usearch -fastx_get_sample_names *_relabeled4Usearch.fastq -output sample.txt + usearch -fastx_get_sample_names *_relabeled4Usearch.fastq -output sample.txt - usearch -fastq_filter *_relabeled4Usearch.fastq -fastaout ${sample_id}.fasta + usearch -fastq_filter *_relabeled4Usearch.fastq -fastaout ${sample_id}.fasta - awk '/^>/ {print(\$0)}; /^[^>]/ {print(toupper(\$0))}' *.fasta > ${sample_id}_upper.fasta + awk '/^>/ {print(\$0)}; /^[^>]/ {print(toupper(\$0))}' *.fasta > ${sample_id}_upper.fasta - """ - else if(mode == 'usearch64') - """ - for files in ${fastqs} - do - label=\$(echo \$files | cut -d '/' -f 3 | cut -d '.' -f 1) - $usearch64 -fastx_relabel \$files -prefix \${label}. -fastqout \${label}.relabeled.fastq - done + """ + else if(mode == 'usearch64') + """ + for files in ${fastqs} + do + label=\$(echo \$files | cut -d '/' -f 3 | cut -d '.' -f 1) + $usearch64 -fastx_relabel \$files -prefix \${label}. -fastqout \${label}.relabeled.fastq + done - for files in *.relabeled.fastq - do - name=\$(echo \$files | cut -d '/' -f '2' | cut -d '.' -f 1) - echo \${name} >> CountOfSeq.txt - grep "^@\${name}" \$files | wc -l >> CountOfSeq.txt - done - - cat *.relabeled.fastq > "${sample_id}_QF_Dmux_minLF_relabeled4Usearch.fastq" + for files in *.relabeled.fastq + do + name=\$(echo \$files | cut -d '/' -f '2' | cut -d '.' -f 1) + echo \${name} >> CountOfSeq.txt + grep "^@\${name}" \$files | wc -l >> CountOfSeq.txt + done + + cat *.relabeled.fastq > "${sample_id}_QF_Dmux_minLF_relabeled4Usearch.fastq" - $usearch64 -fastx_get_sample_names *_relabeled4Usearch.fastq -output sample.txt + $usearch64 -fastx_get_sample_names *_relabeled4Usearch.fastq -output sample.txt - $usearch64 -fastq_filter *_relabeled4Usearch.fastq -fastaout ${sample_id}.fasta + $usearch64 -fastq_filter *_relabeled4Usearch.fastq -fastaout ${sample_id}.fasta - awk '/^>/ {print(\$0)}; /^[^>]/ {print(toupper(\$0))}' *.fasta > ${sample_id}_upper.fasta - """ - } + awk '/^>/ {print(\$0)}; /^[^>]/ {print(toupper(\$0))}' *.fasta > ${sample_id}_upper.fasta + """ +} + // redirecting channel choice depending on whether or not --skipDemux parameter is set demux_channel = (params.skipDemux ? name_ch.combine(dmuxed_relabeled_input_ch) : relabel_ch) + /* * Process 6: Dereplication, ZOTUs creation, ZOTU table creation */ - process '06_Uniques_ZOTUs' { label 'usearch' echo true publishDir "${task.process}_${sample_id}", mode: params.publish_dir_mode + input: tuple val(sample_id), path(upper_fasta) from demux_channel @@ -444,34 +448,34 @@ process '06_Uniques_ZOTUs' { tuple val(sample_id), path("${sample_id}_Unq.fasta"), path("${sample_id}_zotus.fasta"), path("zotuTable.txt") into zotu_ch when: - !params.onlyDemux + !params.onlyDemux script: - if(mode == 'usearch32') - """ - usearch -fastx_uniques ${upper_fasta} -sizeout -fastaout "${sample_id}_Unq.fasta" + if(mode == 'usearch32') + """ + usearch -fastx_uniques ${upper_fasta} -sizeout -fastaout "${sample_id}_Unq.fasta" - usearch -unoise3 "${sample_id}_Unq.fasta" -zotus "${sample_id}_zotus.fasta" -tabbedout "${sample_id}_Unq_unoise3.txt" -minsize $minsize + usearch -unoise3 "${sample_id}_Unq.fasta" -zotus "${sample_id}_zotus.fasta" -tabbedout "${sample_id}_Unq_unoise3.txt" -minsize $minsize - usearch -otutab ${upper_fasta} -zotus ${sample_id}_zotus.fasta -otutabout zotuTable.txt -mapout zmap.txt - """ - else if(mode == 'usearch64') - """ - $usearch64 -fastx_uniques ${upper_fasta} -sizeout -fastaout "${sample_id}_Unq.fasta" + usearch -otutab ${upper_fasta} -zotus ${sample_id}_zotus.fasta -otutabout zotuTable.txt -mapout zmap.txt + """ + else if(mode == 'usearch64') + """ + $usearch64 -fastx_uniques ${upper_fasta} -sizeout -fastaout "${sample_id}_Unq.fasta" - $usearch64 -unoise3 "${sample_id}_Unq.fasta" -zotus "${sample_id}_zotus.fasta" -tabbedout "${sample_id}_Unq_unoise3.txt" -minsize $minsize + $usearch64 -unoise3 "${sample_id}_Unq.fasta" -zotus "${sample_id}_zotus.fasta" -tabbedout "${sample_id}_Unq_unoise3.txt" -minsize $minsize - $usearch64 -otutab ${upper_fasta} -zotus ${sample_id}_zotus.fasta -otutabout zotuTable.txt -mapout zmap.txt - """ - else if(mode == 'vsearch') - """ - $vsearch --derep_fulllength ${upper_fasta} --sizeout --output "${sample_id}_Unq.fasta" + $usearch64 -otutab ${upper_fasta} -zotus ${sample_id}_zotus.fasta -otutabout zotuTable.txt -mapout zmap.txt + """ + else if(mode == 'vsearch') + """ + $vsearch --derep_fulllength ${upper_fasta} --sizeout --output "${sample_id}_Unq.fasta" - $vsearch --cluster_unoise "${sample_id}_Unq.fasta" --centroids "${sample_id}_centroids.fasta" --minsize $minsize - $vsearch --uchime3_denovo "${sample_id}_centroids.fasta" --nonchimeras "${sample_id}_zotus.fasta" --relabel zotu + $vsearch --cluster_unoise "${sample_id}_Unq.fasta" --centroids "${sample_id}_centroids.fasta" --minsize $minsize + $vsearch --uchime3_denovo "${sample_id}_centroids.fasta" --nonchimeras "${sample_id}_zotus.fasta" --relabel zotu - $vsearch --usearch_global ${upper_fasta} --db "${sample_id}_zotus.fasta" --id 0.97 --otutabout zotuTable.txt - """ + $vsearch --usearch_global ${upper_fasta} --db "${sample_id}_zotus.fasta" --id 0.97 --otutabout zotuTable.txt + """ } @@ -483,39 +487,42 @@ process '07_blast' { label 'blast' publishDir "${task.process}_${sample_id}", mode: params.publish_dir_mode + input: - tuple val(sample_id), path(a), path(zotus_fasta), path(zotuTable) from zotu_ch + tuple val(sample_id), path(a), path(zotus_fasta), path(zotuTable) from zotu_ch + output: - tuple val(sample_id), path("${sample_id}_blast_Result.tab"), path(zotuTable), path("match_list.txt") into blast_ch + tuple val(sample_id), path("${sample_id}_blast_Result.tab"), path(zotuTable), path("match_list.txt") into blast_ch + script: - """ - blastn -task ${blast_task} \ - -db "${params.blast_db} ${params.custom_db}" \ - -outfmt "6 qseqid sseqid staxids sscinames scomnames sskingdoms pident length qlen slen mismatch gapopen gaps qstart qend sstart send stitle evalue bitscore qcovs qcovhsp" \ - -perc_identity $perc_identity -evalue $evalue \ - -best_hit_score_edge 0.05 -best_hit_overhang 0.25 \ - -qcov_hsp_perc $qcov -max_target_seqs $maxTarSeq \ - -query ${zotus_fasta} -out ${sample_id}_blast_Result.tab \ - -num_threads ${task.cpus} - - makeblastdb -in ${zotus_fasta} -parse_seqids -dbtype nucl -out ${sample_id}_zotus - - blastn -db ${sample_id}_zotus \ - -outfmt "6 qseqid sseqid pident" \ - -out match_list.txt -qcov_hsp_perc 80 \ - -perc_identity 84 -query ${zotus_fasta} \ - -num_threads ${task.cpus} - - """ + """ + blastn -task ${blast_task} \ + -db "${params.blast_db} ${params.custom_db}" \ + -outfmt "6 qseqid sseqid staxids sscinames scomnames sskingdoms pident length qlen slen mismatch gapopen gaps qstart qend sstart send stitle evalue bitscore qcovs qcovhsp" \ + -perc_identity $perc_identity -evalue $evalue \ + -best_hit_score_edge 0.05 -best_hit_overhang 0.25 \ + -qcov_hsp_perc $qcov -max_target_seqs $maxTarSeq \ + -query ${zotus_fasta} -out ${sample_id}_blast_Result.tab \ + -num_threads ${task.cpus} + + makeblastdb -in ${zotus_fasta} -parse_seqids -dbtype nucl -out ${sample_id}_zotus + + blastn -db ${sample_id}_zotus \ + -outfmt "6 qseqid sseqid pident" \ + -out match_list.txt -qcov_hsp_perc 80 \ + -perc_identity 84 -query ${zotus_fasta} \ + -num_threads ${task.cpus} + """ } + /* * Process 8: LULU curation */ - process '08_lulu' { label 'lulu' + publishDir "${task.process}_${sample_id}_minMatch${minMatch_lulu}", mode: params.publish_dir_mode input: @@ -532,26 +539,50 @@ process '08_lulu' { } - /* * Process 9: Taxonomy assignment with LCA */ process '09_taxonomyAssigned' { label 'lca_python3' + publishDir "${task.process}_${lca_output}_qCov${lca_qcov}_id${lca_pid}_diff${lca_diff}", mode: params.publish_dir_mode input: - tuple path(table), path(blastRes) from zotuTable_ch.combine(blastFile_ch) + tuple path(table), path(blastRes) from zotuTable_ch_a.combine(blastFile_ch) file lcaScript from lca_script output: - tuple path("interMediate_res.tab"), path("${lca_output}_qCov${lca_qcov}_id${lca_pid}_diff${lca_diff}.tab") into last_ch - + tuple path("interMediate_res.tab"), path("${lca_output}_qCov${lca_qcov}_id${lca_pid}_diff${lca_diff}.tab") into lca_ch script: """ python3 $lca_script ${table} ${blastRes} ${lca_qcov} ${lca_pid} ${lca_diff} ${lca_output}_qCov${lca_qcov}_id${lca_pid}_diff${lca_diff}.tab """ -} - +} + + +/* + * Process 10: Create phyloseq object + */ +process '10_create_phyloseq' { + label 'phyloseq' + + publishDir "${task.process}", mode: params.publish_dir_mode + + input: + tuple path(i), path(lca_table) from lca_ch + path(zotu_table) from zotuTable_ch_b + path(fasta_file) from fastaFile_ch + path(metadata_file) from metadataFile_ch + file phyloseqScript from phyloseq_script + val optimise from params.optimise_tree + + output: + path("eDNAFlow_phyloseq.rds") into last_ch + + script: + """ + Rscript $phyloseq_script -l $lca_table -z $zotu_table -f $fasta_file -m $metadata_file -c $task.cpus -p eDNAFlow_phyloseq.rds -o $optimise + """ +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index b8ce652..fcb9769 100755 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,6 @@ resume = true trace { - fields = 'name,hash,status,exit,realtime,submit,%cpu,%mem' } @@ -23,63 +22,72 @@ trace { */ params { -reads = "*_{R1,R2}.fastq" -barcode = "*_bc.txt" -minQuality = "20" -minAlignLeng = "12" -minLen = "50" -primer_mismatch = "2" -minsize = "8" -maxTarSeq = "10" -perc_identity = "95" -evalue = "1e-3" -qcov = "100" -lulu = "lulu.R" -mode = "usearch32" -usearch64 = "" -vsearch = "" -blast_db = "" -custom_db = "" -blast_task = "blastn" -publish_dir_mode = "symlink" -bindDir = "" -singularityDir = "" - -lca_script = "LCA_taxonomyAssignment_scripts/runAssign_collapsedTaxonomy.py" -zotuTable = "" -blastFile = "" -lca_qcov = "100" -lca_pid = "97" -lca_diff = "1" -lca_output = "lca_taxAssigned_results" - -minMatch_lulu="84" - -test = false -help = false - -// Defaults only, expecting to be overwritten -max_memory = 128.GB -max_cpus = 16 -max_time = 240.h - - + reads = "*_{R1,R2}.fastq" + barcode = "*_bc.txt" + minQuality = "20" + minAlignLeng = "12" + minLen = "50" + primer_mismatch = "2" + minsize = "8" + maxTarSeq = "10" + perc_identity = "95" + evalue = "1e-3" + qcov = "100" + mode = "usearch32" + usearch64 = "" + vsearch = "" + blast_db = "" + custom_db = "" + blast_task = "blastn" + publish_dir_mode = "symlink" + bindDir = "" + singularityDir = "" + + zotuTable = "" + blastFile = "" + fastaFile = "" + metadataFile = "" + lca_qcov = "100" + lca_pid = "97" + lca_diff = "1" + lca_output = "lca_taxAssigned_results" + optimise_tree = false + + lca_script = "scripts/LCA_taxonomyAssignment_scripts/runAssign_collapsedTaxonomy.py" + lulu = "scripts/lulu.R" + phyloseq_script = "scripts/create_phyloseq_object.R" + dada2_script = "scripts/DADA2.R" + cutadapt_script = "scripts/demux_cutadapt.sh" + + minMatch_lulu="84" + + test = false + help = false + + // Defaults only, expecting to be overwritten + max_memory = 128.GB + max_cpus = 16 + max_time = 240.h } + // Load base.config by default for all pipelines // move default values for singularity and process there and then enable includeConfig 'conf/base.config' + // Load test parameters if required if (params.test) { includeConfig 'conf/initialTest.config' } + // Load profiles from conf/profiles.config profiles { includeConfig 'conf/profiles.config' } + // Function to ensure that resource requirements don't go beyond // a maximum limit def check_max(obj, type) { @@ -111,5 +119,4 @@ def check_max(obj, type) { return obj } } -} - +} \ No newline at end of file diff --git a/LCA_taxonomyAssignment_scripts/runAssign_collapsedTaxonomy.py b/scripts/LCA_taxonomyAssignment_scripts/runAssign_collapsedTaxonomy.py similarity index 100% rename from LCA_taxonomyAssignment_scripts/runAssign_collapsedTaxonomy.py rename to scripts/LCA_taxonomyAssignment_scripts/runAssign_collapsedTaxonomy.py diff --git a/LCA_taxonomyAssignment_scripts/working_function.py b/scripts/LCA_taxonomyAssignment_scripts/working_function.py similarity index 100% rename from LCA_taxonomyAssignment_scripts/working_function.py rename to scripts/LCA_taxonomyAssignment_scripts/working_function.py diff --git a/scripts/create_phyloseq_object.R b/scripts/create_phyloseq_object.R new file mode 100644 index 0000000..17806f3 --- /dev/null +++ b/scripts/create_phyloseq_object.R @@ -0,0 +1,164 @@ +#=========================================================================== +# Create phyloseq object (with phyloseq) +# based on here: https://joey711.github.io/phyloseq/import-data.html +# Seb Rauschert +# 22/07/2022 +# Modified: Jessica Pearce +# 04/11/2022 +# Modified: Adam Bennett +# 22/02/2023 +#=========================================================================== + +suppressPackageStartupMessages(library(phyloseq)) +suppressPackageStartupMessages(library(tidyverse)) +suppressPackageStartupMessages(library(readr)) +suppressPackageStartupMessages(library(optparse)) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(DECIPHER)) +suppressPackageStartupMessages(library(phangorn)) +suppressPackageStartupMessages(library(Biostrings)) +suppressPackageStartupMessages(library(seqinr)) + +# Define options for command line +option_list = list( + make_option(c("-l", "--lca_file"), action="store", default=NA, type='character', + help="Table produced by LCA script (e.g., '09_taxonomy_assigned/lca_taxAssigned_results_qCov100_id97_diff1.tab')"), + make_option(c("-z", "--zotu_file"), action="store", default=NA, type='character', + help="OTU or ZOTU table (e.g., '08_lulu/curated_zotuTable.tab')"), + make_option(c("-f", "--fasta_file"), action="store", default=NA, type='character', + help="fasta file (e.g., '06_Unique_ZOTUs/*.fasta_zotus.fasta')"), + make_option(c("-m", "--metadata_file"), action="store", default=NA, type='character', + help="metadata file in csv format (e.g., 'test_metadata.csv')"), + make_option(c("-c", "--cores"), action="store", default=NA, type='integer', + help="number of cores (e.g., 100)"), + make_option(c("-p", "--phyloseq_file"), action="store", default=NA, type='character', + help="The name of the output file"), + make_option(c("-o", "--optimise"), action="store", default=FALSE, type='logical', + help="TRUE or FALSE; setting this to TRUE will cause the script take longer to run")) + + +#........................................................................ +# Setup +#........................................................................ + +opt = parse_args(OptionParser(option_list=option_list)) + +lca_file <- opt$lca_file +zotu_file <- opt$zotu_file +fasta_file <- opt$fasta_file +metadata_file <- opt$metadata_file +cores <- opt$cores +phyloseq_file <- opt$phyloseq_file +optimise <- opt$optimise + +lca_tab <- read_tsv(lca_file) +otu_tab <- read_tsv(zotu_file) + +colnames(otu_tab)[1] <- "ASV" +colnames(lca_tab)[8] <- "ASV" +fasta <- read.fasta(fasta_file, as.string = TRUE, forceDNAtolower = FALSE) + +otu_tab["ASV_sequence"] <- NA + +for(row in 1:nrow(otu_tab)) { + ASV <- otu_tab[row, "ASV"] + otu_tab[row, "ASV_sequence"] <- fasta[[ASV$ASV]][1] +} + +asv_seq <- otu_tab %>% + select(ASV, ASV_sequence) + +merged_tab <- merge(lca_tab, asv_seq, by = "ASV", all.x = TRUE) + +taxa <- merged_tab +otu <- merged_tab +seq_tab <- merged_tab + +if (grep(".csv", metadata_file)){ + meta <- read_csv(metadata_file) +} else if (grep(".tsv", metadata_file)) { + meta <- read_tsv(metadata_file) +} + + +#........................................................................ +# Prepare metadata +#........................................................................ + +meta <- as.data.frame(meta) +rownames(meta) <- meta$sample_id +meta$sample_id <- NULL + + +#........................................................................ +# Prepare taxa data +#........................................................................ + +taxa["LCA"] <- "" +taxa %>% + select(ASV, domain, phylum, class, order, family, genus, species, LCA, ASV_sequence) -> taxa +taxa <- as.data.frame(taxa) + +# We need to fill the LCA column starting with 'species', then 'genus', etc +# until we reach a taxonomy level that isn't 'na' or 'dropped' +levels <- c("species", "genus", "family", "order", "class", "phylum", "domain") +for (row in 1:nrow(taxa)) { + LCA <- NA + level_ID <- 1 + while(LCA == "dropped" | is.na(LCA)) { + LCA <- taxa[row, levels[level_ID]] + level_ID <- level_ID + 1 + } + taxa[row, "LCA"] <- LCA +} + +rownames(taxa) <- taxa$ASV +taxa$ASV <- NULL +taxa <- as.matrix(taxa) + + +#........................................................................ +# Prepare otu data +#........................................................................ + +otu <- as.data.frame(otu) +rownames(otu) <- otu$ASV +otu[,c('ASV', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'Contam', 'ASV_sequence')] <- list(NULL) + + +#........................................................................ +# Create phylogenetic tree +#........................................................................ + +# Phylogenetic tree code based on code from +# https://ucdavis-bioinformatics-training.github.io/2021-May-Microbial-Community-Analysis/data_reduction/02-dada2 +DNA_set = DNAStringSet(seq_tab$ASV_sequence) +names(DNA_set) = paste0(seq_tab$ASV) + +alignment = AlignSeqs(DNA_set, anchor=NA, processors=cores) + +phang_align <- phyDat(as(alignment, "matrix"), type="DNA") +dm <- dist.ml(phang_align) +treeNJ <- NJ(dm) + +fit <- pml(treeNJ, data=phang_align) +fitGTR <- update(fit, k=4, inv=0.2) + +if (optimise) { + # Use 'optim.pml()' to optimise the model parameters + fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE, rearrangement = "stochastic", control = pml.control(trace = 0)) +} + + +#........................................................................ +# Create phyloseq object +#........................................................................ + +OTU <- otu_table(otu, taxa_are_rows = TRUE) +TAX <- tax_table(taxa) +META <- sample_data(meta) +TREE <- phy_tree(fitGTR$tree) + +physeq <- phyloseq(OTU, TAX, META, TREE) + +saveRDS(physeq, file = paste0(phyloseq_file)) \ No newline at end of file diff --git a/lulu.R b/scripts/lulu.R similarity index 100% rename from lulu.R rename to scripts/lulu.R