mahsa-mousavi · a4000 · Feb 22, 2023 · Feb 23, 2023 · Feb 23, 2023 · Feb 23, 2023
diff --git a/README.md b/README.md
@@ -81,6 +81,8 @@ In folders 00 to 09 you will find soft link(s) to the final result files of each
 
 `09_taxonomyAssigned_lca_result_qCov#_id#_diff#`: Intermediate and final taxonomy assignment result files
 
+`10_create_phyloseq`: Phyloseq object created after taxonomy assignment
+
 `work`: Holds all the results, intermediate files, ...  
 
 `.nextflow`: Nextflow generated folder holding history info 
@@ -116,8 +118,8 @@ For single-end run:
 For paired-end run:
 `nextflow run eDNAFlow.nf --barcode 'pe_bc*'  --blast_db 'Path2TestBlastDataset/file.fasta' --custom_db 'path2/customDatabase/myDb' [OPTIONS]` 
 
-For running LCA taxonomy assignment script:
-`nextflow run eDNAFlow.nf --taxonomyAssignment --zotuTable "path2/curatedOruncurated_ZotuTable_file" --blastFile "path2/blastResult_file" --lca_output "my_lca_result" [OPTIONS]`
+For running LCA taxonomy assignment and phyloseq scripts:
+`nextflow run eDNAFlow.nf --taxonomyAssignment --zotuTable "path2/curatedOruncurated_ZotuTable_file" --blastFile "path2/blastResult_file" --lca_output "my_lca_result" --fastaFile path2/zotus.fasta --metadataFile path2/metadata.csv [OPTIONS]`
 
 ## Description of run options
 eDNAFlow allows execution of all or parts of the pipeline as long as the correct file formats are provided. For example, the user may choose to run eDNAFlow on a raw file that hasn't been demultiplexed, or opt to provide an already demultiplexed file. Similarly, a user may have performed the clustering with a different algorithm (e.g. DADA2) and is only interested in using the lca script.   
@@ -162,6 +164,10 @@ For description of LCA script and required file formats see section below: LCA (
 
 `--blastFile "blastResult_file"`: Provide the blast result file name;
 
+`--fastaFile "zotus.fasta"`: Provide the ZOTU, OTU or ASV fasta file name;
+
+`--metadataFile "metadata.csv"`: Provide the metadata file name (there must be a `sample_id` column, the file can be .csv or .tsv);
+
 ***Optional***
 
 `--lca_qcov "percent"`: percent of query coverage; Default is 100
@@ -172,6 +178,8 @@ For description of LCA script and required file formats see section below: LCA (
 
 `--lca_output "string"`: Output file name; 
 
+`--optimise_tree`: It's a boolean, setting this to true will optimise the phylogenetic tree in the phyloseq object, but it will also significantly increase the running time of the pipeline;
+
 
 ### Skipping and/or isolating steps
 
@@ -231,7 +239,7 @@ This setting should be adjusted so higher threshold is employed for genetic mark
 
 `--mode 'usearch32'`: by default eDNAFlow uses the free version of usearch (i.e. usearch 32 bit version); if you have access to 64bit version it can be set via changing mode as `--mode 'usearch64'`; if you are using the `--skipDemux` option, then you also have the option of using the open source vsearch by setting mode as `--mode 'vsearch'`
 
-`--usearch64 'Path2/Usearch64/executable'`: if mode is set to usearch64, then this option has to be specified; the full path must point to usearch64 executable
+`--usearch64 'Path2/Usearch64/executable'`: if mode is set to usearch64, then this option has to be specified; the full path must point to usearch64 executable 
 
 `--vsearch 'Path2/vsearch/executable'`: if mode is set to vsearch, then this option has to be specified; the full path must point to vsearch executable
 

diff --git a/conf/base.config b/conf/base.config
@@ -23,6 +23,7 @@ singularity {
       withLabel: 'blast'                    { container = 'docker://ncbi/blast:2.10.1'  }
       withLabel: 'lulu'                     { container = 'docker://index.docker.io/mahsamousavi/lulu:2019' }
       withLabel: 'lca_python3'              { container = 'docker://python:3.6'}
+      withLabel: 'phyloseq'                 { container = 'docker://index.docker.io/adbennett/dada2_and_phyloseq:v0.1' }
 
       cache = 'lenient' 
 

diff --git a/eDNAFlow.nf b/eDNAFlow.nf
diff --git a/nextflow.config b/nextflow.config
@@ -12,7 +12,6 @@ resume = true
 
 
 trace {
-
   fields = 'name,hash,status,exit,realtime,submit,%cpu,%mem'
 }
 
@@ -23,63 +22,72 @@ trace {
  */
 
 params {
-reads   = "*_{R1,R2}.fastq"
-barcode = "*_bc.txt"
-minQuality = "20"
-minAlignLeng = "12"
-minLen = "50"
-primer_mismatch = "2"
-minsize = "8"
-maxTarSeq = "10"
-perc_identity = "95"
-evalue = "1e-3"
-qcov = "100"
-lulu = "lulu.R"
-mode = "usearch32"  
-usearch64 = ""   
-vsearch = ""
-blast_db = ""
-custom_db = ""
-blast_task = "blastn"
-publish_dir_mode = "symlink" 
-bindDir = ""
-singularityDir = ""
-
-lca_script = "LCA_taxonomyAssignment_scripts/runAssign_collapsedTaxonomy.py"
-zotuTable = ""
-blastFile = ""
-lca_qcov = "100"
-lca_pid = "97"
-lca_diff = "1"
-lca_output = "lca_taxAssigned_results"
-
-minMatch_lulu="84"
-
-test = false
-help = false
-
-// Defaults only, expecting to be overwritten
-max_memory = 128.GB
-max_cpus = 16
-max_time = 240.h
-
-
+  reads   = "*_{R1,R2}.fastq"
+  barcode = "*_bc.txt"
+  minQuality = "20"
+  minAlignLeng = "12"
+  minLen = "50"
+  primer_mismatch = "2"
+  minsize = "8"
+  maxTarSeq = "10"
+  perc_identity = "95"
+  evalue = "1e-3"
+  qcov = "100"
+  mode = "usearch32"  
+  usearch64 = ""   
+  vsearch = ""
+  blast_db = ""
+  custom_db = ""
+  blast_task = "blastn"
+  publish_dir_mode = "symlink" 
+  bindDir = ""
+  singularityDir = ""
+
+  zotuTable = ""
+  blastFile = ""
+  fastaFile = ""
+  metadataFile = ""
+  lca_qcov = "100"
+  lca_pid = "97"
+  lca_diff = "1"
+  lca_output = "lca_taxAssigned_results"
+  optimise_tree = false
+
+  lca_script = "scripts/LCA_taxonomyAssignment_scripts/runAssign_collapsedTaxonomy.py"
+  lulu = "scripts/lulu.R"
+  phyloseq_script = "scripts/create_phyloseq_object.R"
+  dada2_script = "scripts/DADA2.R"
+  cutadapt_script = "scripts/demux_cutadapt.sh"
+
+  minMatch_lulu="84"
+
+  test = false
+  help = false
+
+  // Defaults only, expecting to be overwritten
+  max_memory = 128.GB
+  max_cpus = 16
+  max_time = 240.h
 }
 
+
 // Load base.config by default for all pipelines
 // move default values for singularity and process there and then enable
 includeConfig 'conf/base.config'
 
+
 // Load test parameters if required
 if (params.test) {
   includeConfig 'conf/initialTest.config'
 }
 
+
 // Load profiles from conf/profiles.config
 profiles {
   includeConfig 'conf/profiles.config'
 }
 
+
 // Function to ensure that resource requirements don't go beyond
 // a maximum limit
 def check_max(obj, type) {
@@ -111,5 +119,4 @@ def check_max(obj, type) {
       return obj
     }
   }
-}
-
+}
diff --git a/...nt_scripts/runAssign_collapsedTaxonomy.py → ...nt_scripts/runAssign_collapsedTaxonomy.py b/...nt_scripts/runAssign_collapsedTaxonomy.py → ...nt_scripts/runAssign_collapsedTaxonomy.py
diff --git a/...omyAssignment_scripts/working_function.py → ...omyAssignment_scripts/working_function.py b/...omyAssignment_scripts/working_function.py → ...omyAssignment_scripts/working_function.py
diff --git a/scripts/create_phyloseq_object.R b/scripts/create_phyloseq_object.R
@@ -0,0 +1,164 @@
+#===========================================================================
+# Create phyloseq object (with phyloseq)
+# based on here: https://joey711.github.io/phyloseq/import-data.html
+# Seb Rauschert
+# 22/07/2022
+# Modified: Jessica Pearce
+# 04/11/2022
+# Modified: Adam Bennett
+# 22/02/2023
+#===========================================================================
+
+suppressPackageStartupMessages(library(phyloseq))
+suppressPackageStartupMessages(library(tidyverse))
+suppressPackageStartupMessages(library(readr))
+suppressPackageStartupMessages(library(optparse))
+suppressPackageStartupMessages(library(dplyr))
+suppressPackageStartupMessages(library(DECIPHER))
+suppressPackageStartupMessages(library(phangorn))
+suppressPackageStartupMessages(library(Biostrings))
+suppressPackageStartupMessages(library(seqinr))
+
+# Define options for command line
+option_list = list(
+  make_option(c("-l", "--lca_file"), action="store", default=NA, type='character',
+              help="Table produced by LCA script (e.g., '09_taxonomy_assigned/lca_taxAssigned_results_qCov100_id97_diff1.tab')"),
+  make_option(c("-z", "--zotu_file"), action="store", default=NA, type='character',
+              help="OTU or ZOTU table (e.g., '08_lulu/curated_zotuTable.tab')"),
+  make_option(c("-f", "--fasta_file"), action="store", default=NA, type='character',
+              help="fasta file (e.g., '06_Unique_ZOTUs/*.fasta_zotus.fasta')"),
+  make_option(c("-m", "--metadata_file"), action="store", default=NA, type='character',
+              help="metadata file in csv format (e.g., 'test_metadata.csv')"),          
+  make_option(c("-c", "--cores"), action="store", default=NA, type='integer',
+              help="number of cores (e.g., 100)"),
+  make_option(c("-p", "--phyloseq_file"), action="store", default=NA, type='character',
+              help="The name of the output file"),
+  make_option(c("-o", "--optimise"), action="store", default=FALSE, type='logical',
+              help="TRUE or FALSE; setting this to TRUE will cause the script take longer to run"))
+
+
+#........................................................................
+# Setup
+#........................................................................
+
+opt = parse_args(OptionParser(option_list=option_list))
+
+lca_file      <- opt$lca_file
+zotu_file     <- opt$zotu_file
+fasta_file    <- opt$fasta_file
+metadata_file <- opt$metadata_file
+cores         <- opt$cores
+phyloseq_file <- opt$phyloseq_file
+optimise      <- opt$optimise
+
+lca_tab <- read_tsv(lca_file)
+otu_tab   <- read_tsv(zotu_file)
+
+colnames(otu_tab)[1]   <- "ASV"
+colnames(lca_tab)[8]   <- "ASV"
+fasta <- read.fasta(fasta_file, as.string = TRUE, forceDNAtolower = FALSE)
+
+otu_tab["ASV_sequence"] <- NA
+
+for(row in 1:nrow(otu_tab)) {
+  ASV <- otu_tab[row, "ASV"]
+  otu_tab[row, "ASV_sequence"] <- fasta[[ASV$ASV]][1]
+}
+
+asv_seq <- otu_tab %>%
+  select(ASV, ASV_sequence)
+
+merged_tab <- merge(lca_tab, asv_seq, by = "ASV", all.x = TRUE)
+
+taxa     <- merged_tab
+otu      <- merged_tab
+seq_tab  <- merged_tab
+
+if (grep(".csv", metadata_file)){
+  meta <- read_csv(metadata_file)
+} else if (grep(".tsv", metadata_file)) {
+  meta <- read_tsv(metadata_file)
+}
+
+
+#........................................................................
+# Prepare metadata
+#........................................................................
+
+meta             <- as.data.frame(meta)
+rownames(meta)   <- meta$sample_id
+meta$sample_id <- NULL
+
+
+#........................................................................
+# Prepare taxa data
+#........................................................................
+
+taxa["LCA"] <- ""
+taxa %>%
+  select(ASV, domain, phylum, class, order, family, genus, species, LCA, ASV_sequence) -> taxa
+taxa        <- as.data.frame(taxa)
+
+# We need to fill the LCA column starting with 'species', then 'genus', etc
+# until we reach a taxonomy level that isn't 'na' or 'dropped'
+levels <- c("species", "genus", "family", "order", "class", "phylum", "domain")
+for (row in 1:nrow(taxa)) {
+  LCA      <- NA
+  level_ID <- 1
+  while(LCA == "dropped" | is.na(LCA)) {
+    LCA      <- taxa[row, levels[level_ID]]
+    level_ID <- level_ID + 1
+  }
+  taxa[row, "LCA"] <- LCA
+}
+
+rownames(taxa) <- taxa$ASV
+taxa$ASV       <- NULL
+taxa           <- as.matrix(taxa)
+
+
+#........................................................................
+# Prepare otu data
+#........................................................................
+
+otu           <- as.data.frame(otu)
+rownames(otu) <- otu$ASV
+otu[,c('ASV', 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'Contam', 'ASV_sequence')] <- list(NULL)
+
+
+#........................................................................
+# Create phylogenetic tree
+#........................................................................
+
+# Phylogenetic tree code based on code from
+# https://ucdavis-bioinformatics-training.github.io/2021-May-Microbial-Community-Analysis/data_reduction/02-dada2
+DNA_set = DNAStringSet(seq_tab$ASV_sequence)
+names(DNA_set) = paste0(seq_tab$ASV)
+
+alignment = AlignSeqs(DNA_set, anchor=NA, processors=cores)
+
+phang_align <- phyDat(as(alignment, "matrix"), type="DNA")
+dm          <- dist.ml(phang_align)
+treeNJ      <- NJ(dm)
+
+fit    <- pml(treeNJ, data=phang_align)
+fitGTR <- update(fit, k=4, inv=0.2)
+
+if (optimise) {
+  # Use 'optim.pml()' to optimise the model parameters
+  fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE, rearrangement = "stochastic", control = pml.control(trace = 0))
+}
+
+
+#........................................................................
+# Create phyloseq object
+#........................................................................
+
+OTU    <- otu_table(otu, taxa_are_rows = TRUE)
+TAX    <- tax_table(taxa)
+META   <- sample_data(meta)
+TREE   <- phy_tree(fitGTR$tree)
+
+physeq <- phyloseq(OTU, TAX, META, TREE)
+
+saveRDS(physeq, file = paste0(phyloseq_file))
diff --git a/lulu.R → scripts/lulu.R b/lulu.R → scripts/lulu.R