From 976c683eb95bcbe8f34dd220c6faa027110c41e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Thu, 20 Nov 2025 09:13:16 +0200 Subject: [PATCH 01/12] updated config files, added array size parameter for cluster execution --- conf/base.config | 6 ++++++ nextflow.config | 1 + 2 files changed, 7 insertions(+) diff --git a/conf/base.config b/conf/base.config index 1e69ac57..a8dc5396 100644 --- a/conf/base.config +++ b/conf/base.config @@ -59,8 +59,14 @@ process { withLabel:error_ignore { errorStrategy = 'ignore' } + withLabel:error_retry { errorStrategy = 'retry' maxRetries = 2 } + + withName: 'DRAM:ANNOTATE:CALL:.*|DRAM:ANNOTATE:DB_SEARCH:.*' { + array = params.array_size + } + } diff --git a/nextflow.config b/nextflow.config index fd933daf..38e50edd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -163,6 +163,7 @@ params { /* Process Options */ + array_size = 10 queue_size = 10 // This is the resource requirements for a single process // Not the limit to the total resources available to the pipeline From ef13eed33c6f04bb3ac219aec3b63d23b5bd44aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Thu, 20 Nov 2025 09:56:01 +0200 Subject: [PATCH 02/12] updated nextflow.config --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 38e50edd..d776e4b0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -164,7 +164,7 @@ params { /* Process Options */ array_size = 10 - queue_size = 10 + // queue_size = 10 // This is the resource requirements for a single process // Not the limit to the total resources available to the pipeline // Up to queue_size processes can run in parallel, of various sizes From d9a3bc64c9b6b529396d9e0ce3b2e6fc979fe3fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Fri, 21 Nov 2025 15:27:13 +0200 Subject: [PATCH 03/12] Swap output assignments for rRNA and tRNA collections --- subworkflows/local/collect_rna.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/subworkflows/local/collect_rna.nf b/subworkflows/local/collect_rna.nf index 16c723a0..7e0d51ce 100644 --- a/subworkflows/local/collect_rna.nf +++ b/subworkflows/local/collect_rna.nf @@ -72,8 +72,8 @@ workflow COLLECT_RNA { // Create sheet for rrnas from the collected rRNAs or provided rRNAs // Run RRNA_COLLECT to generate a combined TSV for all fastas RRNA_COLLECT( ch_collected_rRNAs ) - ch_rrna_sheet = RRNA_COLLECT.out.rrna_collected_out - ch_rrna_combined = RRNA_COLLECT.out.rrna_combined_out + ch_rrna_sheet = RRNA_COLLECT.out.rrna_combined_out + ch_rrna_combined = RRNA_COLLECT.out.rrna_collected_out } else { ch_rrna_sheet = default_sheet ch_rrna_combined = default_sheet @@ -82,8 +82,8 @@ workflow COLLECT_RNA { // Create sheet for trnas from the collected tRNAs or provided tRNAs // Run TRNA_COLLECT to generate a combined TSV for all fastas TRNA_COLLECT( ch_collected_tRNAs ) - ch_trna_sheet = TRNA_COLLECT.out.trna_collected_out - ch_trna_combined = TRNA_COLLECT.out.trna_combined_out + ch_trna_sheet = TRNA_COLLECT.out.trna_combined_out + ch_trna_combined = TRNA_COLLECT.out.trna_collected_out } else { ch_trna_sheet = default_sheet ch_trna_combined = default_sheet @@ -96,4 +96,4 @@ workflow COLLECT_RNA { ch_trna_sheet ch_trna_combined -} \ No newline at end of file +} From 63ea2689e5b9836680ef9e4675d0508953dc642c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Tue, 25 Nov 2025 14:04:20 +0200 Subject: [PATCH 04/12] Refactor distill script and configuration for improved clarity and functionality - Update groupby_column default value to "input_fasta" in distill.py - Adjust input paths in distill.nf for consistency - Enhance argument handling in SUMMARIZE process --- bin/distill.py | 53 +++++++++++++++----------------- conf/modules.config | 1 + modules/local/distill/distill.nf | 27 ++++++++++------ 3 files changed, 43 insertions(+), 38 deletions(-) diff --git a/bin/distill.py b/bin/distill.py index a6068826..360d791a 100755 --- a/bin/distill.py +++ b/bin/distill.py @@ -27,7 +27,6 @@ DISTILATE_SORT_ORDER_COLUMNS = [COL_HEADER, COL_SUBHEADER, COL_MODULE, COL_GENE_ID] EXCEL_MAX_CELL_SIZE = 32767 -FASTA_COLUMN = os.getenv('FASTA_COLUMN') DISTILL_DIR = Path(__file__).parent / "assets/forms/distill_sheets" @@ -74,7 +73,7 @@ def fill_a_frame(frame: pd.DataFrame): return pd.Series(counts, index=genome_summary_frame.index) - counts = annotations.groupby(groupby_column, sort=False).apply(fill_a_frame) + counts = annotations.groupby(groupby_column, sort=False)[annotations.columns].apply(fill_a_frame) genome_summary_frame = pd.concat([genome_summary_frame, counts.T], axis=1) return genome_summary_frame @@ -99,7 +98,7 @@ def fill_genome_summary_frame_gene_names(annotations, genome_summary_frame, grou return genome_summary_frame -def summarize_rrnas(rrnas_df, groupby_column=FASTA_COLUMN): +def summarize_rrnas(rrnas_df, groupby_column="input_fasta"): genome_rrna_dict = dict() for genome, frame in rrnas_df.groupby(groupby_column): genome_rrna_dict[genome] = Counter(frame['type']) @@ -113,7 +112,7 @@ def summarize_rrnas(rrnas_df, groupby_column=FASTA_COLUMN): return rrna_frame -def make_genome_summary(annotations, genome_summary_frame, logger, groupby_column=FASTA_COLUMN): +def make_genome_summary(annotations, genome_summary_frame, logger, groupby_column="input_fasta"): summary_frames = list() # get ko summaries @@ -159,11 +158,11 @@ def write_summarized_genomes_to_xlsx(summarized_genomes, output_file, extra_fram frame.to_excel(writer, sheet_name=sheet, index=False) for extra_frame in extra_frames: if extra_frame is not None and not extra_frame.empty: - extra_frame.to_excel(writer, sheet_name=extra_frame[COL_SHEET].iloc[0], index=False) + extra_frame.to_excel(writer, sheet_name=extra_frame[COL_HEADER].iloc[0], index=False) # TODO: add assembly stats like N50, longest contig, total assembled length etc -def make_genome_stats(annotations, rrna_frame=None, trna_frame=None, quast_frame=None, groupby_column=FASTA_COLUMN): +def make_genome_stats(annotations, rrna_frame=None, trna_frame=None, quast_frame=None, groupby_column="input_fasta"): rows = list() columns = ['genome'] if 'scaffold' in annotations.columns: @@ -230,19 +229,19 @@ def make_genome_stats(annotations, rrna_frame=None, trna_frame=None, quast_frame @click.command() @click.option("-i", "--input_file", required=True, help="Annotations path") # @click.option("-o", "--output_dir", required=True, help="Directory to write summarized genomes") -@click.option("--rrna_path", help="rRNA output from annotation") -@click.option("--trna_path", help="tRNA output from annotation") -@click.option("--quast_path", help="Quast summary TSV from the quast step") +@click.option("--rrna_path", help="rRNA output from annotation", default=None, type=click.Path(exists=True)) +@click.option("--trna_path", help="tRNA output from annotation", default=None, type=click.Path(exists=True)) +@click.option("--quast_path", help="Quast summary TSV from the quast step", default=None, type=click.Path(exists=True)) @click.option("--groupby_column", help="Column from annotations to group as organism units", - default=FASTA_COLUMN) + default="input_fasta", type = click.STRING) @click.option("--distil_topics", default="default", help="Default distillates topics to run.") @click.option("--distil_ecosystem", default="eng_sys,ag", help="Default distillates ecosystems to run.") -@click.option("--custom_distillate", default=[], callback=validate_comma_separated, help="Custom distillate forms to add your own modules, comma separated. ") +@click.option("--custom_distillate", default="", callback=validate_comma_separated, help="Custom distillate forms to add your own modules, comma separated. ") @click.option("--distillate_gene_names", is_flag=True, show_default=True, default=False, help="Give names of genes instead of counts in genome metabolism summary") -def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby_column=FASTA_COLUMN, distil_topics=None, distil_ecosystem=None, - custom_distillate=None, distillate_gene_names=False): +def distill(input_file, rrna_path, trna_path, quast_path, groupby_column, distil_topics, distil_ecosystem, + custom_distillate, distillate_gene_names): """Summarize metabolic content of annotated genomes""" # make output folder # mkdir(output_dir) @@ -255,24 +254,20 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby # Check the columns are present check_columns(annotations, logger) - if trna_path is None: - trna_frame = None - else: + trna_frame = None + rrna_frame = None + if all([v is not None for v in [trna_path, rrna_path]]): trna_frame = pd.read_csv(trna_path, sep='\t') - if rrna_path is None: - rrna_frame = None - else: rrna_frame = pd.read_csv(rrna_path, sep='\t') - # Check NF DRAM didn't pass an empty sheet to signal no tRNAs or rRNAs - if rrna_frame.empty: - rrna_frame = None - if trna_frame.empty: - trna_frame = None - - if quast_path is None: - quast_frame = None - else: + if any(v.dropna(how="all").empty for v in [trna_frame, rrna_frame]): + trna_frame = None + rrna_frame = None + + quast_frame = None + if quast_path is not None: quast_frame = pd.read_csv(quast_path, sep='\t') + if quast_frame.dropna(how="all").empty: + quast_frame = None distil_sheets_names = [] if "default" in distil_topics: @@ -322,7 +317,7 @@ def distill(input_file, rrna_path=None, trna_path=None, quast_path=None, groupby genome_summary_form = genome_summary_form.reset_index(drop=True) # make genome stats - genome_stats = make_genome_stats(annotations, rrna_frame, trna_frame, quast_frame=quast_frame, groupby_column=groupby_column) + genome_stats = make_genome_stats(annotations, rrna_frame, trna_frame, quast_frame, groupby_column=groupby_column) genome_stats.to_csv('genome_stats.tsv', sep='\t', index=None) logger.info('Calculated genome statistics') diff --git a/conf/modules.config b/conf/modules.config index 3cdcf759..a867fb63 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -254,6 +254,7 @@ process { ] } withName: SUMMARIZE { + ext.args = { '--groupby_column "input_fasta"' } publishDir = [ [ path: "${params.outdir}/SUMMARIZE", diff --git a/modules/local/distill/distill.nf b/modules/local/distill/distill.nf index 3c772684..41a9024e 100644 --- a/modules/local/distill/distill.nf +++ b/modules/local/distill/distill.nf @@ -7,10 +7,10 @@ process SUMMARIZE { container "community.wave.seqera.io/library/python_pandas_openpyxl_click_dram-viz:bd6f4fb065d73a68" input: - path( ch_combined_annotations, stageAs: "raw-annotations.tsv" ) - path( ch_rrna_collected, stageAs: "rrna_combined.tsv" ) - path( ch_trna_collected, stageAs: "trna_combined.tsv" ) - path( ch_quast_stats ) + path( combined_annotations, stageAs: "raw-annotations.tsv" ) + path( rrna_collected, stageAs: "rrna_combined.tsv" ) + path( trna_collected, stageAs: "trna_combined.tsv" ) + path( quast_stats ) val( distill_topic ) val( distill_ecosystem ) val( distill_custom ) @@ -22,11 +22,20 @@ process SUMMARIZE { path( "genome_stats.tsv" ), emit: genome_stats script: - """ - # export constants for script - export FASTA_COLUMN="${params.CONSTANTS.FASTA_COLUMN}" - - distill.py -i ${ch_combined_annotations} --rrna_path '${ch_rrna_collected}' --trna_path '${ch_trna_collected}' --distil_topics "${distill_topic}" --distil_ecosystem "${distill_ecosystem}" --custom_distillate "${distill_custom}" --quast_path '${ch_quast_stats}' + def args = task.ext.args ?: "" + def rrna = rrna_collected ? "--rrna_path ${rrna_collected}" : "" + def trna = trna_collected ? "--trna_path ${trna_collected}" : "" + def quast = quast_stats ? "--trna_path ${quast_stats}" : "" """ + distill.py \ + -i ${combined_annotations} \ + ${rrna} \ + ${trna} \ + ${quast} \ + --distil_topics "${distill_topic}" \ + --distil_ecosystem "${distill_ecosystem}" \ + --custom_distillate "${distill_custom}" \ + ${args} + """ } From 74149799f48e05d97d7fa77ecb3ad1aa8448683b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Wed, 26 Nov 2025 10:41:52 +0200 Subject: [PATCH 05/12] Refactor input and output path definitions for consistency in the SUMMARIZE process --- modules/local/distill/distill.nf | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/modules/local/distill/distill.nf b/modules/local/distill/distill.nf index 41a9024e..94a5b94e 100644 --- a/modules/local/distill/distill.nf +++ b/modules/local/distill/distill.nf @@ -7,25 +7,25 @@ process SUMMARIZE { container "community.wave.seqera.io/library/python_pandas_openpyxl_click_dram-viz:bd6f4fb065d73a68" input: - path( combined_annotations, stageAs: "raw-annotations.tsv" ) - path( rrna_collected, stageAs: "rrna_combined.tsv" ) - path( trna_collected, stageAs: "trna_combined.tsv" ) - path( quast_stats ) - val( distill_topic ) - val( distill_ecosystem ) - val( distill_custom ) + path combined_annotations + path rrna_collected + path trna_collected + path quast_stats + val distill_topic + val distill_ecosystem + val distill_custom output: - path( "metabolism_summary.xlsx" ), emit: distillate - path( "*.log" ), emit: log - path( "summarized_genomes.tsv" ), emit: summarized_genomes - path( "genome_stats.tsv" ), emit: genome_stats + path "metabolism_summary.xlsx", emit: distillate + path "*.log", emit: log + path "summarized_genomes.tsv", emit: summarized_genomes + path "genome_stats.tsv", emit: genome_stats script: def args = task.ext.args ?: "" def rrna = rrna_collected ? "--rrna_path ${rrna_collected}" : "" def trna = trna_collected ? "--trna_path ${trna_collected}" : "" - def quast = quast_stats ? "--trna_path ${quast_stats}" : "" + def quast = quast_stats ? "--quast_path ${quast_stats}" : "" """ distill.py \ From a77c29e8205e302a85ebe8a03e79106962c2a6c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Wed, 26 Nov 2025 12:33:07 +0200 Subject: [PATCH 06/12] Fix conditional check for gene columns in genome summary export to prevent errors --- bin/distill.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/distill.py b/bin/distill.py index 360d791a..232a8192 100755 --- a/bin/distill.py +++ b/bin/distill.py @@ -153,8 +153,9 @@ def write_summarized_genomes_to_xlsx(summarized_genomes, output_file, extra_fram frame = frame.sort_values(DISTILATE_SORT_ORDER_COLUMNS) frame = frame.drop([COL_SHEET], axis=1) gene_columns = list(set(frame.columns) - set(CONSTANT_DISTILLATE_COLUMNS)) - split_genes = pd.concat([split_names_to_long(frame[i].astype(str)) for i in gene_columns], axis=1) - frame = pd.concat([frame[CONSTANT_DISTILLATE_COLUMNS], split_genes], axis=1) + if gene_columns: + split_genes = pd.concat([split_names_to_long(frame[i].astype(str)) for i in gene_columns], axis=1) + frame = pd.concat([frame[CONSTANT_DISTILLATE_COLUMNS], split_genes], axis=1) frame.to_excel(writer, sheet_name=sheet, index=False) for extra_frame in extra_frames: if extra_frame is not None and not extra_frame.empty: @@ -199,7 +200,6 @@ def make_genome_stats(annotations, rrna_frame=None, trna_frame=None, quast_frame # Rename the index column to input_fasta (or whatever you want) df_rrna = df_rrna.rename(columns={"index": "genome"}) df_rrna.columns.name = None - print(df_rrna) genome_stats = pd.merge(genome_stats, df_rrna, how="outer", on="genome") if trna_frame is not None: meta_cols = ["gene_id", "gene_description", "category", From e8b0e95c51208b24cff41d23cbdb3e61218ca887 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Wed, 26 Nov 2025 14:27:45 +0200 Subject: [PATCH 07/12] Refactor channel usage for consistency across workflows and improve readability --- subworkflows/local/annotate.nf | 9 +-- subworkflows/local/call.nf | 4 +- subworkflows/local/collect_rna.nf | 8 +-- subworkflows/local/db_search.nf | 115 +++++++++++++++--------------- subworkflows/local/merge.nf | 6 +- subworkflows/local/qc.nf | 8 +-- workflows/dram.nf | 33 ++++----- 7 files changed, 86 insertions(+), 97 deletions(-) diff --git a/subworkflows/local/annotate.nf b/subworkflows/local/annotate.nf index bd30a51b..b231ac73 100644 --- a/subworkflows/local/annotate.nf +++ b/subworkflows/local/annotate.nf @@ -34,8 +34,8 @@ workflow ANNOTATE { ch_combined_annotations = default_sheet if (params.rename || call) { - fasta_name = ch_fasta.map { it[0] } - fasta_files = ch_fasta.map { it[1] } + fasta_name = ch_fasta.map { it-> it[0] } + fasta_files = ch_fasta.map { it -> it[1] } n_fastas = file("$params.input_fasta/${params.fasta_fmt}").size() } @@ -49,10 +49,7 @@ workflow ANNOTATE { // we use flatten here to turn a list back into a channel renamed_fasta_paths = RENAME_FASTA.out.renamed_fasta_paths.flatten() // we need to recreate the fasta channel with the renamed fasta files - ch_fasta = renamed_fasta_paths.map { - fasta_name = it.getBaseName() - tuple(fasta_name, it) - } + ch_fasta = renamed_fasta_paths.map { it -> [ it.getBaseName(), it ] } } ch_quast_stats = default_sheet diff --git a/subworkflows/local/call.nf b/subworkflows/local/call.nf index 74ae33d5..5f7c5dc6 100644 --- a/subworkflows/local/call.nf +++ b/subworkflows/local/call.nf @@ -38,13 +38,13 @@ workflow CALL { .set { ch_collected_faa } // Set the resulting list to ch_collected_faa // Collect all individual fasta to pass to quast - Channel.empty() + channel.empty() .mix( ch_called_genes ) .collect() .set { ch_collected_fna } // Collect all individual fasta to pass to quast - Channel.empty() + channel.empty() .mix( ch_filtered_fasta, ch_gene_gff ) .collect() .set { ch_collected_fasta } diff --git a/subworkflows/local/collect_rna.nf b/subworkflows/local/collect_rna.nf index 9105e83a..2c566b07 100644 --- a/subworkflows/local/collect_rna.nf +++ b/subworkflows/local/collect_rna.nf @@ -33,7 +33,7 @@ workflow COLLECT_RNA { // If we didn't run call if (!call) { if (params.rrnas) { - Channel.fromPath("${params.rrnas}/*.tsv", checkIfExists: true) + channel.fromPath("${params.rrnas}/*.tsv", checkIfExists: true) .ifEmpty { exit 1, "If you specify --distill_ without --call, you must provide individual rRNA files generated with barrnap. Cannot find any files at: ${params.rrnas}\nNB: Path needs to follow pattern: path/to/directory" } .collect() .set { ch_collected_rRNAs } @@ -42,7 +42,7 @@ workflow COLLECT_RNA { } if (params.trnas) { // the user provided rrnas or trnas - Channel.fromPath("${params.trnas}/*.tsv", checkIfExists: true) + channel.fromPath("${params.trnas}/*.tsv", checkIfExists: true) .ifEmpty { exit 1, "If you specify --distill_ without --call, you must provide individual tRNA files generated with tRNAscan-SE. Cannot find any files at: ${params.trnas}\nNB: Path needs to follow pattern: path/to/directory" } .collect() .set { ch_collected_tRNAs } @@ -53,14 +53,14 @@ workflow COLLECT_RNA { TRNA_SCAN( ch_fasta ) ch_trna_scan = TRNA_SCAN.out.trna_scan_out // Collect all input_fasta formatted tRNA files - Channel.empty() + channel.empty() .mix( ch_trna_scan ) .collect() .set { ch_collected_tRNAs } // Run barrnap on each fasta to identify rRNAs RRNA_SCAN( ch_fasta ) ch_rrna_scan = RRNA_SCAN.out.rrna_scan_out - Channel.empty() + channel.empty() .mix( ch_rrna_scan ) .collect() .set { ch_collected_rRNAs } diff --git a/subworkflows/local/db_search.nf b/subworkflows/local/db_search.nf index e93b3498..dc18a662 100644 --- a/subworkflows/local/db_search.nf +++ b/subworkflows/local/db_search.nf @@ -83,7 +83,7 @@ workflow DB_SEARCH { main: - DB_CHANNEL_SETUP( + DB_channel_SETUP( use_kegg, use_kofam, use_dbcan, @@ -125,23 +125,20 @@ workflow DB_SEARCH { if (!call) { - ch_called_proteins = Channel + ch_called_proteins = channel .fromPath(file(params.input_genes) / params.genes_fmt, checkIfExists: true) .ifEmpty { exit 1, "If you specify --annotate without --call, you must provide a fasta file of called genes using --input_genes. Cannot find any called gene fasta files matching: ${params.input_genes}\nNB: Path needs to follow pattern: path/to/directory/" } - .map { - input_fastaName = it.getBaseName() - tuple(input_fastaName, it) + .map { it -> [ it.getBaseName(), it ] } } - GENE_LOCS( ch_called_proteins) ch_gene_locs = GENE_LOCS.out.prodigal_locs_tsv n_fastas = file("$params.input_genes/${params.genes_fmt}").size() } - def formattedOutputChannels = channel.of() + def formattedOutputchannels = channel.of() // Here we will create mmseqs2 index files for each of the inputs if we are going to do a mmseqs2 database - if (DB_CHANNEL_SETUP.out.index_mmseqs) { + if (DB_channel_SETUP.out.index_mmseqs) { // Use MMSEQS2 to index each called genes protein file MMSEQS_INDEX( ch_called_proteins ) ch_mmseqs_query = MMSEQS_INDEX.out.mmseqs_index_out @@ -150,17 +147,17 @@ workflow DB_SEARCH { // KEGG annotation if (use_kegg) { ch_combined_query_locs_kegg = ch_mmseqs_query.join(ch_gene_locs) - MMSEQS_SEARCH_KEGG( ch_combined_query_locs_kegg, DB_CHANNEL_SETUP.out.ch_kegg_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, kegg_name ) + MMSEQS_SEARCH_KEGG( ch_combined_query_locs_kegg, DB_channel_SETUP.out.ch_kegg_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, kegg_name ) ch_kegg_unformatted = MMSEQS_SEARCH_KEGG.out.mmseqs_search_formatted_out SQL_KEGG(ch_kegg_unformatted, kegg_name, ch_sql_descriptions_db) ch_kegg_formatted = SQL_KEGG.out.sql_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_kegg_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_kegg_formatted) } // KOFAM annotation if (use_kofam) { - HMM_SEARCH_KOFAM ( ch_called_proteins, params.kofam_e_value, DB_CHANNEL_SETUP.out.ch_kofam_db ) + HMM_SEARCH_KOFAM ( ch_called_proteins, params.kofam_e_value, DB_channel_SETUP.out.ch_kofam_db ) ch_kofam_hmms = HMM_SEARCH_KOFAM.out.hmm_search_out PARSE_HMM_KOFAM ( ch_kofam_hmms ) @@ -170,23 +167,23 @@ workflow DB_SEARCH { KOFAM_HMM_FORMATTER ( ch_combined_hits_locs_kofam, ch_kofam_list ) ch_kofam_formatted = KOFAM_HMM_FORMATTER.out.kofam_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_kofam_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_kofam_formatted) } // PFAM annotation if (use_pfam) { ch_combined_query_locs_pfam = ch_mmseqs_query.join(ch_gene_locs) - MMSEQS_SEARCH_PFAM( ch_combined_query_locs_pfam, DB_CHANNEL_SETUP.out.ch_pfam_mmseqs_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, pfam_name ) + MMSEQS_SEARCH_PFAM( ch_combined_query_locs_pfam, DB_channel_SETUP.out.ch_pfam_mmseqs_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, pfam_name ) ch_pfam_unformatted = MMSEQS_SEARCH_PFAM.out.mmseqs_search_formatted_out SQL_PFAM(ch_pfam_unformatted, pfam_name, ch_sql_descriptions_db) ch_pfam_formatted = SQL_PFAM.out.sql_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_pfam_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_pfam_formatted) } // dbCAN annotation if (use_dbcan) { - HMM_SEARCH_DBCAN ( ch_called_proteins, params.dbcan_e_value , DB_CHANNEL_SETUP.out.ch_dbcan_db) + HMM_SEARCH_DBCAN ( ch_called_proteins, params.dbcan_e_value , DB_channel_SETUP.out.ch_dbcan_db) ch_dbcan_hmms = HMM_SEARCH_DBCAN.out.hmm_search_out PARSE_HMM_DBCAN ( ch_dbcan_hmms ) @@ -196,12 +193,12 @@ workflow DB_SEARCH { DBCAN_HMM_FORMATTER ( ch_combined_hits_locs_dbcan, dbcan_name, ch_sql_descriptions_db ) ch_dbcan_formatted = DBCAN_HMM_FORMATTER.out.sql_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_dbcan_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_dbcan_formatted) } // CAMPER annotation if (use_camper) { // HMM - HMM_SEARCH_CAMPER ( ch_called_proteins, params.camper_e_value , DB_CHANNEL_SETUP.out.ch_camper_hmm_db) + HMM_SEARCH_CAMPER ( ch_called_proteins, params.camper_e_value , DB_channel_SETUP.out.ch_camper_hmm_db) ch_camper_hmms = HMM_SEARCH_CAMPER.out.hmm_search_out PARSE_HMM_CAMPER ( ch_camper_hmms ) @@ -211,18 +208,18 @@ workflow DB_SEARCH { CAMPER_HMM_FORMATTER ( ch_combined_hits_locs_camper, ch_camper_hmm_list ) ch_camper_hmm_formatted = CAMPER_HMM_FORMATTER.out.camper_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_camper_hmm_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_camper_hmm_formatted) // MMseqs ch_combined_query_locs_camper = ch_mmseqs_query.join(ch_gene_locs) - MMSEQS_SEARCH_CAMPER( ch_combined_query_locs_camper, DB_CHANNEL_SETUP.out.ch_camper_mmseqs_db, params.bit_score_threshold, params.rbh_bit_score_threshold, DB_CHANNEL_SETUP.out.ch_camper_mmseqs_list, camper_name ) + MMSEQS_SEARCH_CAMPER( ch_combined_query_locs_camper, DB_channel_SETUP.out.ch_camper_mmseqs_db, params.bit_score_threshold, params.rbh_bit_score_threshold, DB_channel_SETUP.out.ch_camper_mmseqs_list, camper_name ) ch_camper_mmseqs_formatted = MMSEQS_SEARCH_CAMPER.out.mmseqs_search_formatted_out - formattedOutputChannels = formattedOutputChannels.mix(ch_camper_mmseqs_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_camper_mmseqs_formatted) } // FeGenie annotation if (use_fegenie) { - HMM_SEARCH_FEGENIE ( ch_called_proteins, params.fegenie_e_value, DB_CHANNEL_SETUP.out.ch_fegenie_db ) + HMM_SEARCH_FEGENIE ( ch_called_proteins, params.fegenie_e_value, DB_channel_SETUP.out.ch_fegenie_db ) ch_fegenie_hmms = HMM_SEARCH_FEGENIE.out.hmm_search_out PARSE_HMM_FEGENIE ( ch_fegenie_hmms ) @@ -231,27 +228,27 @@ workflow DB_SEARCH { ch_combined_hits_locs_fegenie = ch_fegenie_parsed.join(ch_gene_locs) FEGENIE_HMM_FORMATTER ( ch_combined_hits_locs_fegenie ) ch_fegenie_formatted = FEGENIE_HMM_FORMATTER.out.fegenie_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_fegenie_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_fegenie_formatted) } // Methyl annotation if (use_methyl) { ch_combined_query_locs_methyl = ch_mmseqs_query.join(ch_gene_locs) - MMSEQS_SEARCH_METHYL( ch_combined_query_locs_methyl, DB_CHANNEL_SETUP.out.ch_methyl_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, methyl_name ) + MMSEQS_SEARCH_METHYL( ch_combined_query_locs_methyl, DB_channel_SETUP.out.ch_methyl_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, methyl_name ) ch_methyl_mmseqs_formatted = MMSEQS_SEARCH_METHYL.out.mmseqs_search_formatted_out - formattedOutputChannels = formattedOutputChannels.mix(ch_methyl_mmseqs_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_methyl_mmseqs_formatted) } // CANT-HYD annotation if (use_canthyd) { // MMseqs ch_combined_query_locs_canthyd = ch_mmseqs_query.join(ch_gene_locs) - MMSEQS_SEARCH_CANTHYD( ch_combined_query_locs_canthyd, DB_CHANNEL_SETUP.out.ch_canthyd_mmseqs_db, params.bit_score_threshold, params.rbh_bit_score_threshold, DB_CHANNEL_SETUP.out.ch_canthyd_mmseqs_list, canthyd_name ) + MMSEQS_SEARCH_CANTHYD( ch_combined_query_locs_canthyd, DB_channel_SETUP.out.ch_canthyd_mmseqs_db, params.bit_score_threshold, params.rbh_bit_score_threshold, DB_channel_SETUP.out.ch_canthyd_mmseqs_list, canthyd_name ) ch_canthyd_mmseqs_formatted = MMSEQS_SEARCH_CANTHYD.out.mmseqs_search_formatted_out - formattedOutputChannels = formattedOutputChannels.mix(ch_canthyd_mmseqs_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_canthyd_mmseqs_formatted) //HMM - HMM_SEARCH_CANTHYD ( ch_called_proteins, params.canthyd_e_value , DB_CHANNEL_SETUP.out.ch_canthyd_hmm_db) + HMM_SEARCH_CANTHYD ( ch_called_proteins, params.canthyd_e_value , DB_channel_SETUP.out.ch_canthyd_hmm_db) ch_canthyd_hmms = HMM_SEARCH_CANTHYD.out.hmm_search_out PARSE_HMM_CANTHYD ( ch_canthyd_hmms ) @@ -261,12 +258,12 @@ workflow DB_SEARCH { CANTHYD_HMM_FORMATTER ( ch_combined_hits_locs_canthyd, ch_canthyd_hmm_list ) ch_canthyd_hmm_formatted = CANTHYD_HMM_FORMATTER.out.canthyd_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_canthyd_hmm_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_canthyd_hmm_formatted) } // Sulfur annotation if (use_sulfur) { - HMM_SEARCH_SULFUR ( ch_called_proteins, params.sulfur_e_value, DB_CHANNEL_SETUP.out.ch_sulfur_db ) + HMM_SEARCH_SULFUR ( ch_called_proteins, params.sulfur_e_value, DB_channel_SETUP.out.ch_sulfur_db ) ch_sulfur_hmms = HMM_SEARCH_SULFUR.out.hmm_search_out PARSE_HMM_SULFUR ( ch_sulfur_hmms ) @@ -276,33 +273,33 @@ workflow DB_SEARCH { SULFUR_HMM_FORMATTER ( ch_combined_hits_locs_sulfur ) ch_sulfur_formatted = SULFUR_HMM_FORMATTER.out.sulfur_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_sulfur_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_sulfur_formatted) } // MEROPS annotation if (use_merops) { ch_combined_query_locs_merops = ch_mmseqs_query.join(ch_gene_locs) - MMSEQS_SEARCH_MEROPS( ch_combined_query_locs_merops, DB_CHANNEL_SETUP.out.ch_merops_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, merops_name ) + MMSEQS_SEARCH_MEROPS( ch_combined_query_locs_merops, DB_channel_SETUP.out.ch_merops_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, merops_name ) ch_merops_unformatted = MMSEQS_SEARCH_MEROPS.out.mmseqs_search_formatted_out SQL_MEROPS(ch_merops_unformatted, merops_name, ch_sql_descriptions_db) ch_merops_formatted = SQL_MEROPS.out.sql_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_merops_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_merops_formatted) } // Uniref annotation if (use_uniref) { ch_combined_query_locs_uniref = ch_mmseqs_query.join(ch_gene_locs) - MMSEQS_SEARCH_UNIREF( ch_combined_query_locs_uniref, DB_CHANNEL_SETUP.out.ch_uniref_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, uniref_name ) + MMSEQS_SEARCH_UNIREF( ch_combined_query_locs_uniref, DB_channel_SETUP.out.ch_uniref_db, params.bit_score_threshold, params.rbh_bit_score_threshold, default_sheet, uniref_name ) ch_uniref_unformatted = MMSEQS_SEARCH_UNIREF.out.mmseqs_search_formatted_out SQL_UNIREF(ch_uniref_unformatted, uniref_name, ch_sql_descriptions_db) ch_uniref_formatted = SQL_UNIREF.out.sql_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_uniref_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_uniref_formatted) } // VOGdb annotation if (use_vog) { - HMM_SEARCH_VOG ( ch_called_proteins, params.vog_e_value , DB_CHANNEL_SETUP.out.ch_vogdb_db ) + HMM_SEARCH_VOG ( ch_called_proteins, params.vog_e_value , DB_channel_SETUP.out.ch_vogdb_db ) ch_vog_hmms = HMM_SEARCH_VOG.out.hmm_search_out PARSE_HMM_VOG ( ch_vog_hmms ) @@ -312,21 +309,21 @@ workflow DB_SEARCH { VOG_HMM_FORMATTER ( ch_combined_hits_locs_vog, vogdb_name, ch_sql_descriptions_db ) ch_vog_formatted = VOG_HMM_FORMATTER.out.vog_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_vog_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_vog_formatted) } // Viral annotation if (params.use_viral) { ch_combined_query_locs_viral = ch_mmseqs_query.join(ch_gene_locs) - MMSEQS_SEARCH_VIRAL( ch_combined_query_locs_viral, DB_CHANNEL_SETUP.out.ch_viral_db, params.bit_score_threshold, params.rbh_bit_score_threshold,default_sheet, viral_name ) + MMSEQS_SEARCH_VIRAL( ch_combined_query_locs_viral, DB_channel_SETUP.out.ch_viral_db, params.bit_score_threshold, params.rbh_bit_score_threshold,default_sheet, viral_name ) ch_viral_unformatted = MMSEQS_SEARCH_VIRAL.out.mmseqs_search_formatted_out SQL_VIRAL(ch_viral_unformatted, viral_name, ch_sql_descriptions_db) ch_viral_formatted = SQL_VIRAL.out.sql_formatted_hits - formattedOutputChannels = formattedOutputChannels.mix(ch_viral_formatted) + formattedOutputchannels = formattedOutputchannels.mix(ch_viral_formatted) } - fastas = formattedOutputChannels.map { it[1] }.collect() - genes = ch_called_proteins.map { it[1] }.collect() + fastas = formattedOutputchannels.map { it -> it[1] }.collect() + genes = ch_called_proteins.map { it -> it[1] }.collect() COMBINE_ANNOTATIONS( fastas, genes ) ch_combined_annotations = COMBINE_ANNOTATIONS.out.combined_annotations_out @@ -337,7 +334,7 @@ workflow DB_SEARCH { } -workflow DB_CHANNEL_SETUP { +workflow DB_channel_SETUP { take: use_kegg use_kofam @@ -356,24 +353,24 @@ workflow DB_CHANNEL_SETUP { main: index_mmseqs = false - ch_kegg_db = Channel.empty() - ch_kofam_db = Channel.empty() - ch_dbcan_db = Channel.empty() - ch_camper_hmm_db = Channel.empty() - ch_camper_mmseqs_db = Channel.empty() - ch_camper_mmseqs_list = Channel.empty() - ch_merops_db = Channel.empty() - ch_pfam_mmseqs_db = Channel.empty() - ch_heme_db = Channel.empty() - ch_sulfur_db = Channel.empty() - ch_uniref_db = Channel.empty() - ch_methyl_db = Channel.empty() - ch_fegenie_db = Channel.empty() - ch_canthyd_hmm_db = Channel.empty() - ch_canthyd_mmseqs_db = Channel.empty() - ch_canthyd_mmseqs_list = Channel.empty() - ch_vogdb_db = Channel.empty() - ch_viral_db = Channel.empty() + ch_kegg_db = channel.empty() + ch_kofam_db = channel.empty() + ch_dbcan_db = channel.empty() + ch_camper_hmm_db = channel.empty() + ch_camper_mmseqs_db = channel.empty() + ch_camper_mmseqs_list = channel.empty() + ch_merops_db = channel.empty() + ch_pfam_mmseqs_db = channel.empty() + ch_heme_db = channel.empty() + ch_sulfur_db = channel.empty() + ch_uniref_db = channel.empty() + ch_methyl_db = channel.empty() + ch_fegenie_db = channel.empty() + ch_canthyd_hmm_db = channel.empty() + ch_canthyd_mmseqs_db = channel.empty() + ch_canthyd_mmseqs_list = channel.empty() + ch_vogdb_db = channel.empty() + ch_viral_db = channel.empty() if (use_kegg) { ch_kegg_db = file(params.kegg_db).exists() ? file(params.kegg_db) : error("Error: If using --annotate, you must supply prebuilt databases. KEGG database file not found at ${params.kegg_db}") diff --git a/subworkflows/local/merge.nf b/subworkflows/local/merge.nf index 8b4f7ece..a14fa940 100644 --- a/subworkflows/local/merge.nf +++ b/subworkflows/local/merge.nf @@ -31,10 +31,10 @@ workflow MERGE { } // Create a channel with the paths to the .tsv files - Channel - .from(tsv_files.collect { annotations_dir.toString() + '/' + it }) + channel + .from(tsv_files.collect { it -> annotations_dir.toString() + '/' + it }) .set { ch_merge_annotations } - Channel.empty() + channel.empty() .mix( ch_merge_annotations ) .collect() .set { ch_merge_annotations_collected } diff --git a/subworkflows/local/qc.nf b/subworkflows/local/qc.nf index 298c4fff..fa397491 100644 --- a/subworkflows/local/qc.nf +++ b/subworkflows/local/qc.nf @@ -49,15 +49,13 @@ workflow QC { if( params.generate_gff || params.generate_gbk ){ if (!call) { - ch_called_genes = Channel + ch_called_genes = channel .fromPath(file(params.input_genes) / params.genes_fna_fmt, checkIfExists: true) .ifEmpty { exit 1, "If you specify --generate_gff or --generate_gbk without --call, you must provide a fasta file of called genes using --input_genes and --genes_fna_fmt,. Cannot find any called gene fasta files matching: ${params.input_genes} and ${params.genes_fna_fmt}\nNB: Path needs to follow pattern: path/to/directory/" } - .map { - input_fastaName = it.getBaseName() - tuple(input_fastaName, it) + .map { it -> [ it.getBaseName(), it ] } } // Collect all individual fasta to pass to quast - Channel.empty() + channel.empty() .mix( ch_called_genes ) .collect() .set { ch_collected_fna } diff --git a/workflows/dram.nf b/workflows/dram.nf index fd4a5fc2..1dd909a0 100644 --- a/workflows/dram.nf +++ b/workflows/dram.nf @@ -37,9 +37,9 @@ workflow DRAM { // Setup // - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() - ch_fasta = Channel.empty() + ch_versions = channel.empty() + ch_multiqc_files = channel.empty() + ch_fasta = channel.empty() default_sheet = file(params.distill_dummy_sheet) distill_flag = (params.summarize || params.distill_topic != "" || params.distill_ecosystem != "" || params.distill_custom != "" || params.sum_ecos != "") @@ -61,14 +61,11 @@ workflow DRAM { if (params.rename || call) { - ch_fasta = Channel + ch_fasta = channel .fromPath(file(params.input_fasta) / params.fasta_fmt, checkIfExists: true) .ifEmpty { exit 1, "Cannot find any fasta files matching: ${params.input_fasta}\nNB: Path needs to follow pattern: path/to/directory/" } - ch_fasta = ch_fasta.map { - fasta_name = it.getBaseName() - tuple(fasta_name, it) - } + ch_fasta = ch_fasta.map { it -> [ it.getBaseName(), it ] } } use_kegg = params.use_kegg @@ -85,7 +82,7 @@ workflow DRAM { use_vog = params.use_vog if (params.anno_dbs != "") { - anno_dbs = params.anno_dbs.tokenize(',').collect { it.trim().toLowerCase() } + anno_dbs = params.anno_dbs.tokenize(',').collect { it -> it.trim().toLowerCase() } value_for_all = 'all' use_kegg = getDBFlag(anno_dbs, 'kegg', value_for_all) use_kofam = getDBFlag(anno_dbs, 'kofam', value_for_all) @@ -241,7 +238,7 @@ workflow DRAM { ch_final_annots = ADD_ANNOTATIONS.out.combined_annots_out } } else { // If the user has not specified --annotate, use the provided annotations - ch_final_annots = Channel + ch_final_annots = channel .fromPath(params.annotations, checkIfExists: true) .ifEmpty { exit 1, "If you specify --distill_ without --annotate, you must provide an annotations TSV file (--annotations ) with approprite formatting. Cannot find any called gene files matching: ${params.annotations}\nNB: Path needs to follow pattern: path/to/directory/" } } @@ -263,7 +260,7 @@ workflow DRAM { } else if (params.annotations) { - ch_final_annots = Channel + ch_final_annots = channel .fromPath(params.annotations, checkIfExists: true) .ifEmpty { exit 1, "Parameter annotations problem: Cannot find any called gene files matching: ${params.annotations}\nNB: Path needs to follow pattern: path/to/directory/" } } @@ -293,24 +290,24 @@ workflow DRAM { // // MODULE: MultiQC // - ch_multiqc_config = Channel.fromPath( + ch_multiqc_config = channel.fromPath( "$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? - Channel.fromPath(params.multiqc_config, checkIfExists: true) : - Channel.empty() + channel.fromPath(params.multiqc_config, checkIfExists: true) : + channel.empty() ch_multiqc_logo = params.multiqc_logo ? - Channel.fromPath(params.multiqc_logo, checkIfExists: true) : - Channel.empty() + channel.fromPath(params.multiqc_logo, checkIfExists: true) : + channel.empty() summary_params = paramsSummaryMap( workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) + ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) ch_multiqc_files = ch_multiqc_files.mix( ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value( + ch_methods_description = channel.value( methodsDescriptionText(ch_multiqc_custom_methods_description)) ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) From 44187394ad3bb31e68992242feb75271515b94e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Thu, 27 Nov 2025 11:03:26 +0200 Subject: [PATCH 08/12] Update SUMMARIZE module to use parameterized fasta column for grouping --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index a867fb63..342b35f5 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -254,7 +254,7 @@ process { ] } withName: SUMMARIZE { - ext.args = { '--groupby_column "input_fasta"' } + ext.args = { "--groupby_column ${params.CONSTANTS.FASTA_COLUMN}" } publishDir = [ [ path: "${params.outdir}/SUMMARIZE", From cd3b7ac1ca571b7b6eb153254f11b1f03086e782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Fri, 28 Nov 2025 10:53:04 +0200 Subject: [PATCH 09/12] Fix closure in QC workflow --- subworkflows/local/qc.nf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/subworkflows/local/qc.nf b/subworkflows/local/qc.nf index fa397491..fe7a5bf8 100644 --- a/subworkflows/local/qc.nf +++ b/subworkflows/local/qc.nf @@ -25,35 +25,35 @@ workflow QC { // Add Bin Quality to annotations - if( params.bin_quality ){ + if ( params.bin_quality ) { ch_bin_quality = file(params.bin_quality) ADD_BIN_QUALITY( ch_combined_annotations, ch_bin_quality ) ch_updated_annots = ADD_BIN_QUALITY.out.annots_bin_quality_out } - else{ + else { ch_updated_annots = ch_combined_annotations } // Add Taxonomy to annotations - if( params.taxa ){ + if ( params.taxa ) { ch_taxa = file(params.taxa) ADD_TAXA( ch_updated_annots, ch_taxa ) ch_updated_taxa_annots = ADD_TAXA.out.annots_taxa_out } - else{ + else { ch_updated_taxa_annots = ch_combined_annotations } ch_final_annots = ch_updated_taxa_annots - if( params.generate_gff || params.generate_gbk ){ + if ( params.generate_gff || params.generate_gbk ) { if (!call) { ch_called_genes = channel .fromPath(file(params.input_genes) / params.genes_fna_fmt, checkIfExists: true) .ifEmpty { exit 1, "If you specify --generate_gff or --generate_gbk without --call, you must provide a fasta file of called genes using --input_genes and --genes_fna_fmt,. Cannot find any called gene fasta files matching: ${params.input_genes} and ${params.genes_fna_fmt}\nNB: Path needs to follow pattern: path/to/directory/" } .map { it -> [ it.getBaseName(), it ] } - } + // Collect all individual fasta to pass to quast channel.empty() .mix( ch_called_genes ) From ed054bdb48531fc6a29b2e71fd21ea12b7130858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Fri, 28 Nov 2025 10:57:43 +0200 Subject: [PATCH 10/12] Fix closure in DB_SEARCH workflow --- subworkflows/local/db_search.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/db_search.nf b/subworkflows/local/db_search.nf index dc18a662..ea65cd01 100644 --- a/subworkflows/local/db_search.nf +++ b/subworkflows/local/db_search.nf @@ -129,7 +129,7 @@ workflow DB_SEARCH { .fromPath(file(params.input_genes) / params.genes_fmt, checkIfExists: true) .ifEmpty { exit 1, "If you specify --annotate without --call, you must provide a fasta file of called genes using --input_genes. Cannot find any called gene fasta files matching: ${params.input_genes}\nNB: Path needs to follow pattern: path/to/directory/" } .map { it -> [ it.getBaseName(), it ] } - } + GENE_LOCS( ch_called_proteins) ch_gene_locs = GENE_LOCS.out.prodigal_locs_tsv n_fastas = file("$params.input_genes/${params.genes_fmt}").size() From d39ff1405e0011c5e678ca4cb13773bdbac7661d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Mon, 1 Dec 2025 15:14:11 +0200 Subject: [PATCH 11/12] Updated combine_annotations.py to fix binwise summary. TODO: getting rid of FASTA_COLUMN environment variable --- bin/combine_annotations.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/bin/combine_annotations.py b/bin/combine_annotations.py index 3387f868..9029e8ca 100755 --- a/bin/combine_annotations.py +++ b/bin/combine_annotations.py @@ -14,15 +14,14 @@ logger = get_logger(filename=Path(__file__).stem) def read_and_preprocess(path: Path): - # We design input fastas from intermediate steps to be named like: "input_fasta___some_information_annotation_file.tsv" input_fasta = input_fasta_from_filepath(path) try: df = pd.read_csv(path) - df[FASTA_COLUMN] = input_fasta # Add input_fasta column + df[FASTA_COLUMN] = input_fasta return df except Exception as e: logger.error(f"Error loading DataFrame for input_fasta {input_fasta}: {str(e)}") - return pd.DataFrame() # Return an empty DataFrame in case of error + return pd.DataFrame() def input_fasta_from_filepath(file_path: Path): return file_path.stem.split("___")[0] @@ -51,7 +50,6 @@ def count_motifs(gene_faa, motif="(C..CH)", genes_faa_dict=None): for seq in read_sequence(gene_faa, format="fasta"): if seq.metadata["id"] not in genes_faa_dict: genes_faa_dict[seq.metadata["id"]] = {} - genes_faa_dict[seq.metadata["id"]]["heme_regulatory_motif_count"] = len(list(seq.find_with_regex(motif))) return genes_faa_dict @@ -61,12 +59,10 @@ def set_gene_data(gene_faa, genes_faa_dict=None): for seq in read_sequence(gene_faa, format="fasta"): if seq.metadata["id"] not in genes_faa_dict: genes_faa_dict[seq.metadata["id"]] = {} - split_label = seq.metadata["id"].split("_") gene_position = split_label[-1] start_position, end_position, strandedness = seq.metadata["description"].split("#")[1:4] - - genes_faa_dict[seq.metadata["id"]][FASTA_COLUMN] = os.path.commonprefix([Path(gene_faa).stem, seq.metadata["id"]]).rstrip("_") + genes_faa_dict[seq.metadata["id"]][FASTA_COLUMN] = str(Path(gene_faa).stem).replace('_called_genes', '') genes_faa_dict[seq.metadata["id"]]["scaffold"] = ( seq.metadata["id"] .removeprefix(genes_faa_dict[seq.metadata["id"]][FASTA_COLUMN]) @@ -83,16 +79,13 @@ def organize_columns(df, special_columns=None): special_columns = [] base_columns = ['query_id', FASTA_COLUMN, "scaffold", 'gene_number', 'start_position', 'stop_position', 'strandedness', 'rank'] base_columns = [col for col in base_columns if col in df.columns] - kegg_columns = sorted([col for col in df.columns if col.startswith('kegg_')], key=lambda x: (x != 'kegg_id', x)) other_columns = [col for col in df.columns if col not in base_columns + kegg_columns + special_columns] - db_prefixes = set(col.split('_')[0] for col in other_columns) sorted_other_columns = [] for prefix in db_prefixes: prefixed_columns = sorted([col for col in other_columns if col.startswith(prefix + '_')], key=lambda x: (x != f"{prefix}_id", x)) sorted_other_columns.extend(prefixed_columns) - final_columns_order = base_columns + kegg_columns + sorted_other_columns + special_columns return df[final_columns_order] @@ -106,11 +99,10 @@ def combine_annotations(annotations_dir, genes_dir, output, threads): annotations = Path(annotations_dir).glob("*") genes_faa = Path(genes_dir).glob("*") with ThreadPoolExecutor(max_workers=threads) as executor: - # futures = [executor.submit(read_and_preprocess, input_fasta, path) for input_fasta, path in input_fastas_and_paths] futures = [executor.submit(read_and_preprocess, Path(path)) for path in annotations] data_frames = [future.result() for future in as_completed(futures)] - combined_data = pd.concat(data_frames, ignore_index=True) + combined_data = pd.concat([df for df in data_frames if not df.empty], ignore_index=True) if genes_faa: genes_faa_dict = dict() for gene_path in genes_faa: @@ -118,13 +110,9 @@ def combine_annotations(annotations_dir, genes_dir, output, threads): genes_faa_dict count_motifs(gene_path, "(C..CH)", genes_faa_dict=genes_faa_dict) set_gene_data(gene_path, genes_faa_dict) - df = pd.DataFrame.from_dict(genes_faa_dict, orient='index') - combined_data = combined_data.drop(columns=df.columns, errors='ignore') - df.index.name = 'query_id' - - # we use outer to get any genes that don't have hits - combined_data = pd.merge(combined_data, df, how="outer", on="query_id") - combined_data.loc[combined_data[FASTA_COLUMN].isna(), FASTA_COLUMN] = "" + df = pd.DataFrame.from_dict(genes_faa_dict, orient='index').reset_index().rename(columns={'index': 'query_id'}) + combined_data = combined_data.drop(columns=df.columns.difference(["query_id", "scaffold", FASTA_COLUMN]), errors='ignore') + combined_data = pd.merge(combined_data, df, how="outer", on=["query_id", FASTA_COLUMN]) combined_data = convert_bit_scores_to_numeric(combined_data) From 64ab39ea7beb23743f7c11ead73ed8e9bc449791 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20P=C3=A4ll?= Date: Thu, 18 Dec 2025 15:20:22 +0200 Subject: [PATCH 12/12] Add QC:COLLECT_RNA to array pattern --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index a8dc5396..460af8ca 100644 --- a/conf/base.config +++ b/conf/base.config @@ -65,7 +65,7 @@ process { maxRetries = 2 } - withName: 'DRAM:ANNOTATE:CALL:.*|DRAM:ANNOTATE:DB_SEARCH:.*' { + withName: 'DRAM:ANNOTATE:CALL:.*|DRAM:ANNOTATE:DB_SEARCH:.*|DRAM:ANNOTATE:QC:COLLECT_RNA:.*' { array = params.array_size }