Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
work/
results/

.vscode
.vscode
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ Manifests supplied as an argument to `--manifest`, should be of of the following

```console
ID,REP,R1,R2
sample,1,./test_data/inputs/sample_rep1_1.fastq.gz,./test_data/inputs/sample_rep2_2.fastq.gz
sample,1,./test_data/inputs/sample_rep1_1.fastq.gz,./test_data/inputs/sample_rep1_2.fastq.gz
sample,2,./test_data/inputs/sample_rep2_1.fastq.gz,./test_data/inputs/sample_rep2_2.fastq.gz
```

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may need to also include a new help message into the README after the nextflow_schema.json changes have been made.

Expand Down
2 changes: 1 addition & 1 deletion bin/plot_annotation_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def load_data(sample_id: str, data_dir: Path, normalize: bool = False) -> tuple[


def get_gene_annotations(gff: Path) -> pd.DataFrame:
ann_base = pd.read_csv(gff, sep="\t", skiprows=3, header=None)
ann_base = pd.read_csv(gff, sep="\t", header=None, comment="#", dtype={0: "string"} )
# Get gene entries from GFF
ann_base = ann_base[ann_base[2] == 'gene']
# Subset "start", "end", "strand", "tags"
Expand Down
1 change: 1 addition & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def validate_custom_params(params, log, monochrome_logs) {
errors += ParamValidator.validate_no_invalid_args("--bowtie2_args", params.bowtie2_args, ["-x", "-1", "-2", "-p", "-S"], log)
errors += ParamValidator.validate_only_valid_args("--samtools_filter_args", params.samtools_filter_args, ["-f", "-F", "--rf", "-G", "-e"], log)
errors += ParamValidator.validate_no_invalid_args("--htseq_args", params.htseq_args, ["--samout", "--samout-format", "--order", "--stranded", "--counts_output"], log)
errors += ParamValidator.validate_no_invalid_args("--featurecounts_args", params.featurecounts_args, ["-s", "--strandness", "-o", "--output", "-a", "--annotation", "-T", "--threads"], log)
log.info("${colors.reset}")

if (errors > 0) {
Expand Down
140 changes: 140 additions & 0 deletions modules/featurecounts.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@

process FEATURECOUNTS_COUNT {
tag "${meta.ID} : REP${meta.REP}"
label 'cpu_1'
label 'time_1'
label 'mem_16'

conda "bioconda::subread=2.1.1"
container 'quay.io/biocontainers/subread:2.1.1--h577a1d6_0'

/*
* Different output folders / patterns
* - counts tables go to featurecounts/
* - any BAM outputs (if added) can go to featurecounts/bam/
*/

publishDir "${params.outdir}/featurecounts",
mode: 'copy',
overwrite: true,
pattern: "*_featurecounts.tsv"

publishDir "${params.outdir}/featurecounts/bam",
mode: 'copy',
overwrite: true,
pattern: "*_annotated.bam",
enabled: params.annotate_feature_assignment

input:
tuple val(meta), path(mapped_reads), path(annotation)

output:
tuple val(meta), path(count_table), emit: sample_feature_counts
tuple val(meta), path(annotated_bam), optional: true, emit: annotated_bam

script:
output_stem = "${meta.ID}_REP${meta.REP}"
count_table = "${output_stem}_featurecounts.tsv"
annotated_bam = "${output_stem}_annotated.bam"

/*
* Decision on strandedness for featureCounts
* -s 0 : unstranded
* -s 1 : forward stranded 5' to 3'
* -s 2 : reversely stranded 3' to 5'
*/

switch (params.library_strandedness) {
case "none":
fc_strandedness = "0"
break
case "forward":
fc_strandedness = "1"
break
case "reverse":
fc_strandedness = "2"
break
default:
log.error "Unrecognised parameter value for strandedness: ${params.library_strandedness}"
}

if (params.annotate_feature_assignment) {
"""
featureCounts \\
-a ${annotation} \\
-o ${count_table} \\
-s ${fc_strandedness} \\
-T ${task.cpus} \\
${params.featurecounts_args} \\
${mapped_reads}
cp ${mapped_reads} ${annotated_bam}
"""
} else {
"""
featureCounts \\
-a ${annotation} \\
-o ${count_table} \\
-s ${fc_strandedness} \\
-T ${task.cpus} \\
${params.featurecounts_args} \\
${mapped_reads}
"""
}
}

process COMBINE_FEATURECOUNTS {
label 'cpu_1'
label 'time_1'
label 'mem_16'

publishDir "${params.outdir}/featurecounts", mode: 'copy', overwrite: true

container 'ubuntu:22.04'

input:
path(count_tables)

output:
path(counts_table), emit: all_feature_counts

script:
counts_table = "gene_counts.tsv"

"""
files=( *_featurecounts.tsv )
# Extract gene list from first count file (column 1)

if [ \${#files[@]} -eq 0 ]; then
echo "No *_featurecounts.tsv files found" >&2
exit 1
fi

# Extract sample names from filenames
samples=()
for f in "\${files[@]}"; do
# Remove path + suffix (_featurecounts.tsv)
base=\$(basename "\$f" | sed 's/_featurecounts.tsv//')
samples+=( "\$base" )
done

# Write output
printf "feature_id" > ${counts_table}

for s in "\${samples[@]}"; do
printf "\\t%s" "\$s" >> ${counts_table}
done
printf "\\n" >> ${counts_table}

# Build table
cut -f1 "\${files[0]}" | grep -v '^#' | tail -n +2 | while IFS= read -r gene; do
printf '%s' "\$gene"
for f in "\${files[@]}"; do
count=\$(awk -F'\t' -v g="\$gene" '\$1==g && NR>1 {print \$7; exit}' "\$f")
printf '\t%s' "\${count:-0}"
done
printf '\n'
done >> "${counts_table}"

"""
}

12 changes: 6 additions & 6 deletions modules/htseq.nf
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
process HTSEQ_COUNT {
tag "${meta.ID} : REP${meta.REP}"
label 'cpu_1'
label 'mem_100M'
label 'time_1'
label 'mem_16'

conda "bioconda::htseq=2.0.5"
container 'quay.io/biocontainers/htseq:2.0.5--py310h5aa3a86_0'
Expand All @@ -14,8 +14,8 @@ process HTSEQ_COUNT {
tuple val(meta), path(mapped_reads), path(annotation)

output:
tuple val(meta), path("${count_table}"), emit: sample_feature_counts
tuple val(meta), path("${annotated_bam}"), optional: true, emit: annotated_bam
tuple val(meta), path(count_table), emit: sample_feature_counts
tuple val(meta), path(annotated_bam), optional: true, emit: annotated_bam

script:
output_stem = "${meta.ID}_REP${meta.REP}"
Expand Down Expand Up @@ -59,18 +59,18 @@ process HTSEQ_COUNT {

process COMBINE_HTSEQ {
label 'cpu_1'
label 'mem_100M'
label 'time_1'
label 'mem_16'

publishDir "${params.outdir}/htseq", mode: 'copy', overwrite: true, pattern: "*_counts.tsv"
publishDir "${params.outdir}/htseq", mode: 'copy', overwrite: true

container 'ubuntu:22.04'

input:
path(count_tables)

output:
path("${counts_table}"), emit: all_feature_counts
path(counts_table), emit: all_feature_counts

script:
counts_table = "gene_counts.tsv"
Expand Down
13 changes: 10 additions & 3 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ params {
manifest = ""
reference = ""
annotation = ""
library_strandedness = ""
library_strandedness = "none"

// Output
outdir = "./results"
Expand Down Expand Up @@ -42,9 +42,16 @@ params {
min_mapping_quality = 2

// Counting
count_method = "htseq" // htseq (default) or featurecounts
htseq_args = "--type gene --idattr locus_tag --nonunique none --secondary-alignments ignore"
annotate_feature_assignment = false


// Default featureCounts arguments
// -p : paired-end data
// -B : require both mates properly aligned
// -C : do not count discordant mapping
featurecounts_args = "-p -B -C"

// Coverage
skip_strand_specific_analysis = false
coverage_window_size = 100
Expand Down Expand Up @@ -112,4 +119,4 @@ def load_nfcore_profile_config() {
System.err.println("Encountered the following exception:")
throw e
}
}
}
36 changes: 29 additions & 7 deletions subworkflows/count_reads.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ include {
COMBINE_HTSEQ
} from '../modules/htseq'

include {
FEATURECOUNTS_COUNT;
COMBINE_FEATURECOUNTS
} from '../modules/featurecounts'


workflow COUNT_READS {
take:
ch_reads_to_filter
Expand Down Expand Up @@ -36,15 +42,31 @@ workflow COUNT_READS {
// COUNTING
ch_filtered_reads
.combine(annotation)
.set {htseq_input}
HTSEQ_COUNT(htseq_input)
HTSEQ_COUNT.out.sample_feature_counts
.map { ID, count_table -> count_table }
.collect()
.set { ch_count_tables }
.set {ch_count_input}

if (params.count_method == "featurecounts") {

FEATURECOUNTS_COUNT(ch_count_input)
FEATURECOUNTS_COUNT.out.sample_feature_counts
.map { meta, count_table -> count_table }
.collect()
.set { ch_count_tables }

COMBINE_HTSEQ(ch_count_tables)
COMBINE_FEATURECOUNTS(ch_count_tables)

} else if (params.count_method == 'htseq') {

HTSEQ_COUNT(ch_count_input)
HTSEQ_COUNT.out.sample_feature_counts
.map { meta, count_table -> count_table }
.collect()
.set { ch_count_tables }

COMBINE_HTSEQ(ch_count_tables)
} else {
error "Unknown value for params.count_method: '${params.count_method}'. Use 'htseq' or 'featurecounts'."
}

emit:
ch_samtools_stats
}