sanger-pathogens · yazhinia · Dec 1, 2025 · Dec 1, 2025 · Dec 1, 2025 · Dec 9, 2025
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,4 @@
 work/
 results/
 
-.vscode
+.vscode
diff --git a/README.md b/README.md
@@ -158,7 +158,7 @@ Manifests supplied as an argument to `--manifest`, should be of of the following
 
 ```console
 ID,REP,R1,R2
-sample,1,./test_data/inputs/sample_rep1_1.fastq.gz,./test_data/inputs/sample_rep2_2.fastq.gz
+sample,1,./test_data/inputs/sample_rep1_1.fastq.gz,./test_data/inputs/sample_rep1_2.fastq.gz
 sample,2,./test_data/inputs/sample_rep2_1.fastq.gz,./test_data/inputs/sample_rep2_2.fastq.gz
 ```
 

diff --git a/bin/plot_annotation_coverage.py b/bin/plot_annotation_coverage.py
@@ -71,7 +71,7 @@ def load_data(sample_id: str, data_dir: Path, normalize: bool = False) -> tuple[
 
 
 def get_gene_annotations(gff: Path) -> pd.DataFrame:
-    ann_base = pd.read_csv(gff, sep="\t", skiprows=3, header=None)
+    ann_base = pd.read_csv(gff, sep="\t", header=None, comment="#", dtype={0: "string"} )
     # Get gene entries from GFF
     ann_base = ann_base[ann_base[2] == 'gene']
     # Subset "start", "end", "strand", "tags"

diff --git a/main.nf b/main.nf
@@ -56,6 +56,7 @@ def validate_custom_params(params, log, monochrome_logs) {
     errors += ParamValidator.validate_no_invalid_args("--bowtie2_args", params.bowtie2_args, ["-x", "-1", "-2", "-p", "-S"], log)
     errors += ParamValidator.validate_only_valid_args("--samtools_filter_args", params.samtools_filter_args, ["-f", "-F", "--rf", "-G", "-e"], log)
     errors += ParamValidator.validate_no_invalid_args("--htseq_args", params.htseq_args, ["--samout", "--samout-format", "--order", "--stranded", "--counts_output"], log)
+    errors += ParamValidator.validate_no_invalid_args("--featurecounts_args", params.featurecounts_args, ["-s", "--strandness", "-o", "--output", "-a", "--annotation", "-T", "--threads"], log)
     log.info("${colors.reset}")
 
     if (errors > 0) {

diff --git a/modules/featurecounts.nf b/modules/featurecounts.nf
@@ -0,0 +1,140 @@
+
+process FEATURECOUNTS_COUNT {
+    tag "${meta.ID} : REP${meta.REP}"
+    label 'cpu_1'
+    label 'time_1'
+    label 'mem_16'
+
+    conda "bioconda::subread=2.1.1"
+    container 'quay.io/biocontainers/subread:2.1.1--h577a1d6_0'
+
+     /*
+     * Different output folders / patterns
+     * - counts tables go to featurecounts/
+     * - any BAM outputs (if added) can go to featurecounts/bam/
+     */
+
+    publishDir "${params.outdir}/featurecounts",
+        mode: 'copy',
+        overwrite: true,
+        pattern: "*_featurecounts.tsv"
+
+    publishDir "${params.outdir}/featurecounts/bam",
+        mode: 'copy',
+        overwrite: true,
+        pattern: "*_annotated.bam",
+        enabled: params.annotate_feature_assignment
+
+    input:
+    tuple val(meta), path(mapped_reads), path(annotation)
+
+    output:
+    tuple val(meta), path(count_table),  emit: sample_feature_counts
+    tuple val(meta), path(annotated_bam), optional: true,  emit: annotated_bam
+
+    script:
+    output_stem = "${meta.ID}_REP${meta.REP}"
+    count_table = "${output_stem}_featurecounts.tsv"
+    annotated_bam = "${output_stem}_annotated.bam"
+
+    /*
+     * Decision on strandedness for featureCounts
+     *  -s 0 : unstranded
+     *  -s 1 : forward stranded 5' to 3'
+     *  -s 2 : reversely stranded 3' to 5'
+     */
+
+    switch (params.library_strandedness) {
+        case "none":
+            fc_strandedness = "0"
+            break
+        case "forward":
+            fc_strandedness = "1"
+            break
+        case "reverse":
+            fc_strandedness = "2"
+            break
+        default:
+            log.error "Unrecognised parameter value for strandedness: ${params.library_strandedness}"
+    }
+
+    if (params.annotate_feature_assignment) {
+        """
+        featureCounts \\
+            -a ${annotation} \\
+            -o ${count_table} \\
+            -s ${fc_strandedness} \\
+            -T ${task.cpus} \\
+            ${params.featurecounts_args} \\
+            ${mapped_reads}
+        cp ${mapped_reads} ${annotated_bam}
+        """
+    } else {
+        """
+        featureCounts \\
+            -a ${annotation} \\
+            -o ${count_table} \\
+            -s ${fc_strandedness} \\
+            -T ${task.cpus} \\
+            ${params.featurecounts_args} \\
+            ${mapped_reads}
+        """
+    }
+}
+
+process COMBINE_FEATURECOUNTS {
+    label 'cpu_1'
+    label 'time_1'
+    label 'mem_16'
+
+    publishDir "${params.outdir}/featurecounts", mode: 'copy', overwrite: true
+
+    container 'ubuntu:22.04'
+
+    input:
+    path(count_tables)
+
+    output:
+    path(counts_table),  emit: all_feature_counts
+
+    script:
+    counts_table = "gene_counts.tsv"
+
+    """
+    files=( *_featurecounts.tsv )
+    # Extract gene list from first count file (column 1)
+
+    if [ \${#files[@]} -eq 0 ]; then
+      echo "No *_featurecounts.tsv files found" >&2
+      exit 1
+    fi
+
+    # Extract sample names from filenames
+    samples=()
+    for f in "\${files[@]}"; do
+        # Remove path + suffix (_featurecounts.tsv)
+        base=\$(basename "\$f" | sed 's/_featurecounts.tsv//')
+        samples+=( "\$base" )
+    done
+
+    # Write output
+    printf "feature_id" > ${counts_table}
+
+    for s in "\${samples[@]}"; do
+        printf "\\t%s" "\$s" >> ${counts_table}
+    done
+    printf "\\n" >> ${counts_table}
+
+    # Build table
+    cut -f1 "\${files[0]}" | grep -v '^#' | tail -n +2 | while IFS= read -r gene; do
+    printf '%s' "\$gene"
+    for f in "\${files[@]}"; do
+        count=\$(awk -F'\t' -v g="\$gene" '\$1==g && NR>1 {print \$7; exit}' "\$f")
+        printf '\t%s' "\${count:-0}"
+    done
+    printf '\n'
+    done >> "${counts_table}"
+
+    """
+}
+
diff --git a/modules/htseq.nf b/modules/htseq.nf
@@ -1,8 +1,8 @@
 process HTSEQ_COUNT {
     tag "${meta.ID} : REP${meta.REP}"
     label 'cpu_1'
-    label 'mem_100M'
     label 'time_1'
+    label 'mem_16'
 
     conda "bioconda::htseq=2.0.5"
     container 'quay.io/biocontainers/htseq:2.0.5--py310h5aa3a86_0'
@@ -14,8 +14,8 @@ process HTSEQ_COUNT {
     tuple val(meta), path(mapped_reads), path(annotation)
 
     output:
-    tuple val(meta), path("${count_table}"),  emit: sample_feature_counts
-    tuple val(meta), path("${annotated_bam}"), optional: true,  emit: annotated_bam
+    tuple val(meta), path(count_table),  emit: sample_feature_counts
+    tuple val(meta), path(annotated_bam), optional: true,  emit: annotated_bam
 
     script:
     output_stem = "${meta.ID}_REP${meta.REP}"
@@ -59,18 +59,18 @@ process HTSEQ_COUNT {
 
 process COMBINE_HTSEQ {
     label 'cpu_1'
-    label 'mem_100M'
     label 'time_1'
+    label 'mem_16'
 
-    publishDir "${params.outdir}/htseq", mode: 'copy', overwrite: true, pattern: "*_counts.tsv"
+    publishDir "${params.outdir}/htseq", mode: 'copy', overwrite: true
 
     container 'ubuntu:22.04'
 
     input:
     path(count_tables)
 
     output:
-    path("${counts_table}"),  emit: all_feature_counts
+    path(counts_table),  emit: all_feature_counts
 
     script:
     counts_table = "gene_counts.tsv"

diff --git a/nextflow.config b/nextflow.config
@@ -14,7 +14,7 @@ params {
     manifest = ""
     reference = ""
     annotation = ""
-    library_strandedness = ""
+    library_strandedness = "none"
 
     // Output
     outdir = "./results"
@@ -42,9 +42,16 @@ params {
     min_mapping_quality = 2
 
     // Counting
+    count_method = "htseq" // htseq (default) or featurecounts
     htseq_args = "--type gene --idattr locus_tag --nonunique none --secondary-alignments ignore"
     annotate_feature_assignment = false
-
+
+    // Default featureCounts arguments
+    // -p   : paired-end data
+    // -B   : require both mates properly aligned
+    // -C   : do not count discordant mapping
+    featurecounts_args = "-p -B -C"
+
     // Coverage
     skip_strand_specific_analysis = false
     coverage_window_size = 100
@@ -112,4 +119,4 @@ def load_nfcore_profile_config() {
         System.err.println("Encountered the following exception:")
         throw e
     }
-}
+}
diff --git a/subworkflows/count_reads.nf b/subworkflows/count_reads.nf
@@ -8,6 +8,12 @@ include {
     COMBINE_HTSEQ
 } from '../modules/htseq'
 
+include {
+    FEATURECOUNTS_COUNT;
+    COMBINE_FEATURECOUNTS
+} from '../modules/featurecounts'
+
+
 workflow COUNT_READS {
     take:
     ch_reads_to_filter
@@ -36,15 +42,31 @@ workflow COUNT_READS {
     // COUNTING
     ch_filtered_reads
         .combine(annotation)
-        .set {htseq_input}
-    HTSEQ_COUNT(htseq_input)
-    HTSEQ_COUNT.out.sample_feature_counts
-        .map { ID, count_table -> count_table }
-        .collect()
-        .set { ch_count_tables }
+        .set {ch_count_input}
+
+    if (params.count_method == "featurecounts") {
+
+        FEATURECOUNTS_COUNT(ch_count_input)
+        FEATURECOUNTS_COUNT.out.sample_feature_counts
+            .map { meta, count_table -> count_table }
+            .collect()
+            .set { ch_count_tables }
 
-    COMBINE_HTSEQ(ch_count_tables)
+        COMBINE_FEATURECOUNTS(ch_count_tables)
+
+    } else if (params.count_method == 'htseq') {
+
+        HTSEQ_COUNT(ch_count_input)
+        HTSEQ_COUNT.out.sample_feature_counts
+            .map { meta, count_table -> count_table }
+            .collect()
+            .set { ch_count_tables }
 
+        COMBINE_HTSEQ(ch_count_tables)
+    } else {
+        error "Unknown value for params.count_method: '${params.count_method}'. Use 'htseq' or 'featurecounts'."
+    }
+
     emit:
     ch_samtools_stats
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,4 +2,4 @@ @@
     work/
     results/
-    .vscode
+    .vscode