diff --git a/config/config.yaml b/config/config.yaml
index a80ab3b..34c19b4 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -640,7 +640,7 @@ params:
 
     # Extra parameters for MarkDuplicates.
    # See https://gatk.broadinstitute.org/hc/en-us/articles/360057439771-MarkDuplicates-Picard
-    MarkDuplicates: "REMOVE_DUPLICATES=true"
+    MarkDuplicates: "--REMOVE_DUPLICATES true"
 
    # Run several Picard QC tools, as needed, using Picard CollectMultipleMetrics.
    # See https://gatk.broadinstitute.org/hc/en-us/articles/360042478112-CollectMultipleMetrics-Picard
@@ -670,7 +670,7 @@ params:
    # or `-Djava.io.tmpdir=/path/to/tmp` to set a directory for temporary data, in case that the
    # system-provided tmp dir is too small (which can happen on clusters).
    # The last option, SortVcf-java-opts, is used by bcftools when using contig-group-size > 0.
-    MarkDuplicates-java-opts: "-Xmx10g"
+    MarkDuplicates-java-opts: ""
    CollectMultipleMetrics-java-opts: ""
    SortVcf-java-opts: ""
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
index cc57931..bd3ef08 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -21,9 +21,13 @@ rule all:
         # Basic steps
         "mapping/final.done",
         "calling/genotyped-all.vcf.gz",
+        "calling/genotyped-all.vcf.gz.done",
         "calling/filtered-all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else [],
+        "calling/filtered-all.vcf.gz.done" if not config["settings"]["filter-variants"] == "none" else [],
         "annotation/snpeff.vcf.gz" if config["settings"]["snpeff"] else [],
+        "annotation/snpeff.vcf.gz.done" if config["settings"]["snpeff"] else [],
         "annotation/vep.vcf.gz" if config["settings"]["vep"] else [],
+        "annotation/vep.vcf.gz.done" if config["settings"]["vep"] else [],
         # Quality control
         "qc/multiqc.html",
         # Reference genome statistics
diff --git a/workflow/envs/picard.yaml b/workflow/envs/picard.yaml
index 03ce377..3beaa0d 100644
--- a/workflow/envs/picard.yaml
+++ b/workflow/envs/picard.yaml
@@ -16,7 +16,7 @@ dependencies:
   - numpy ==2.0.0
 
   # Tools
-  - picard ==3.2.0
+  - picard ==3.3.0
 
   # Need tidyverse because of issue #9
   # We are now using the version that gets installed when specifying no version,
diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
index 25f09fa..bc52c57 100644
--- a/workflow/rules/annotation.smk
+++ b/workflow/rules/annotation.smk
@@ -68,6 +68,11 @@ rule snpeff:
             if not config["settings"]["filter-variants"] == "none"
             else "calling/genotyped-all.vcf.gz"
         ),
+        done=(
+            "calling/filtered-all.vcf.gz.done"
+            if not config["settings"]["filter-variants"] == "none"
+            else "calling/genotyped-all.vcf.gz.done"
+        ),
         # path to reference db downloaded with the snpeff download wrapper above
         db=get_snpeff_db_path(),
     output:
@@ -77,6 +82,7 @@ rule snpeff:
         stats=report("annotation/snpeff.html", category="Calls"),
         # summary statistics in CSV, optional
         csvstats="annotation/snpeff.csv",
+        done=touch("annotation/snpeff.vcf.gz.done"),
     log:
         "logs/annotation/snpeff.log",
     group:
@@ -191,6 +197,11 @@ rule vep:
             if not config["settings"]["filter-variants"] == "none"
             else "calling/genotyped-all.vcf.gz"
         ),
+        done=(
+            "calling/filtered-all.vcf.gz.done"
+            if not config["settings"]["filter-variants"] == "none"
+            else "calling/genotyped-all.vcf.gz.done"
+        ),
         cache=get_vep_cache_dir(),
         plugins=get_vep_plugins_dir(),
     output:
@@ -209,6 +220,7 @@ rule vep:
             caption="../report/stats.rst",
             category="Calls",
         ),
+        done=touch("annotation/vep.vcf.gz.done"),
     params:
         # Pass a list of plugins to use,
         # see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
diff --git a/workflow/rules/calling-bcftools-combined.smk b/workflow/rules/calling-bcftools-combined.smk
index c56e251..74f772b 100644
--- a/workflow/rules/calling-bcftools-combined.smk
+++ b/workflow/rules/calling-bcftools-combined.smk
@@ -15,6 +15,7 @@ rule call_variants:
         # similar to what our GATK HaplotypeCaller rule does.
         samples=get_all_bams(),
         indices=get_all_bais(),
+        done=get_all_bams_done(),
         # If we use restricted regions, set them here. If not, empty, which will propagate to the
         # get_mpileup_params function as well. Same for small contig groups.
         # regions="calling/regions/{contig}.bed" if config["settings"].get("restrict-regions") else []
@@ -34,7 +35,7 @@ rule call_variants:
             else temp("calling/called/{contig}.vcf.gz")
         ),
         # vcf=protected("calling/called/{contig}.vcf.gz")
-        done=touch("calling/called/{contig}.done"),
+        done=touch("calling/called/{contig}.vcf.gz.done"),
     params:
         # Optional parameters for bcftools mpileup (except -g, -f).
         mpileup=get_mpileup_params,
@@ -86,8 +87,17 @@ def merge_variants_vcfs_input(wildcards):
 # Need index files for some of the downstream tools.
 # Rational for the fai: see merge_variants_vcfs_input()
 def merge_variants_tbis_input(wildcards):
-    fai = checkpoints.samtools_faidx.get().output[0]
-    return expand("calling/called/{contig}.vcf.gz.tbi", contig=get_contigs(fai))
+    files = merge_variants_vcfs_input(wildcards)
+    return [f + ".tbi" for f in files]
+    # fai = checkpoints.samtools_faidx.get().output[0]
+    # return expand("calling/called/{contig}.vcf.gz.tbi", contig=get_contigs(fai))
+
+
+# Lastly, we want to make sure that all inputs are actually done,
+# as snakemake can still mess this up with slurm etc... :-(
+def merge_variants_done_input(wildcards):
+    files = merge_variants_vcfs_input(wildcards)
+    return [f + ".done" for f in files]
 
 
 # bcftools does not automatically create vcf index files, so we need a rule for that...
@@ -159,6 +169,7 @@ rule merge_variants:
         contig_groups=contigs_groups_input,
         vcfs=merge_variants_vcfs_input,
         tbis=merge_variants_tbis_input,
+        done=merge_variants_done_input,
     output:
         # With small contigs, we also need sorting, see below.
         # Unfortunately, we cannot pipe here, as Picard fails with that, so temp file it is...
         vcf=(
             "calling/merged/merged-all.vcf.gz"
             if (config["settings"].get("contig-group-size"))
             else "calling/genotyped-all.vcf.gz"
         ),
         done=(
-            touch("calling/merged/merged-all.done")
+            touch("calling/merged/merged-all.vcf.gz.done")
             if (config["settings"].get("contig-group-size"))
-            else touch("calling/genotyped-all.done")
+            else touch("calling/genotyped-all.vcf.gz.done")
         ),
     log:
         "logs/calling/vcflib/merge-genotyped.log",
diff --git a/workflow/rules/calling-bcftools-individual.smk b/workflow/rules/calling-bcftools-individual.smk
index 5e7605b..afe50c0 100644
--- a/workflow/rules/calling-bcftools-individual.smk
+++ b/workflow/rules/calling-bcftools-individual.smk
@@ -14,6 +14,7 @@ rule call_variants:
         # Get the sample data.
         samples=get_sample_bams_wildcards,
         indices=get_sample_bais_wildcards,
+        done=get_sample_bams_wildcards_done,
         # If we use restricted regions, set them here. If not, empty, which will propagate to the
         # get_mpileup_params function as well. Same for small contig groups.
# regions="calling/regions/{contig}.bed" if config["settings"].get("restrict-regions") else [] @@ -33,7 +34,7 @@ rule call_variants: else temp("calling/called/{sample}.{contig}.g.vcf.gz") ), gtbi="calling/called/{sample}.{contig}.g.vcf.gz.tbi", - done=touch("calling/called/{sample}.{contig}.g.done"), + done=touch("calling/called/{sample}.{contig}.g.vcf.gz.done"), params: # Optional parameters for bcftools mpileup (except -g, -f). mpileup=get_mpileup_params, @@ -54,8 +55,6 @@ rule call_variants: "(" "bcftools mpileup " " {params.mpileup} --fasta-ref {input.ref} --output-type u {input.samples} " - - " -a 'INFO/AD' -a 'FORMAT/AD' -a 'FORMAT/DP' --gvcf 0 | " "bcftools call " " --threads {threads} {params.call} --gvcf 0 --output-type u | " @@ -90,11 +89,15 @@ rule combine_contig: "calling/called/{sample}.{{contig}}.g.vcf.gz.tbi", sample=config["global"]["sample-names"], ), + done=expand( + "calling/called/{sample}.{{contig}}.g.vcf.gz.done", + sample=config["global"]["sample-names"], + ), output: gvcf="calling/combined/all.{contig}.g.vcf.gz", gtbi="calling/combined/all.{contig}.g.vcf.gz.tbi", gvcflist="calling/combined/all.{contig}.g.txt", - done=touch("calling/combined/all.{contig}.g.done"), + done=touch("calling/combined/all.{contig}.g.vcf.gz.done"), log: "logs/calling/bcftools/combine-contig-{contig}.log", benchmark: @@ -131,6 +134,12 @@ def combined_contig_gvcfs(wildcards): return expand("calling/combined/all.{contig}.g.vcf.gz", contig=get_contigs(fai)) +# Also need the done files to make sure snakemake doesn't mess this up. +def combined_contig_done(wildcards): + fai = checkpoints.samtools_faidx.get().output[0] + return expand("calling/called/all.{contig}.g.vcf.gz.done", contig=get_contigs(fai)) + + # We also need a comma-separated list of the contigs, so that bcftools can output # the concatenated entries in the correct order as given in the fai. # For this, we use the same technique of using the fai checkpoint as before. @@ -154,6 +163,7 @@ rule combine_all: ref=get_fai, contig_groups=contigs_groups_input, gvcfs=combined_contig_gvcfs, + dones=combined_contig_done, output: # vcf="calling/genotyped-all.vcf.gz", # tbi="calling/genotyped/all.vcf.gz.tbi", @@ -175,9 +185,9 @@ rule combine_all: else "calling/genotyped-all.txt" ), done=( - touch("calling/merged/merged-all.done") + touch("calling/merged/merged-all.vcf.gz.done") if (config["settings"].get("contig-group-size")) - else touch("calling/genotyped-all.done") + else touch("calling/genotyped-all.vcf.gz.done") ), params: # Use a list of the chromosomes in the same order as the fai, for bcftools to sort the output. diff --git a/workflow/rules/calling-bcftools.smk b/workflow/rules/calling-bcftools.smk index 8f2d893..3646c2f 100644 --- a/workflow/rules/calling-bcftools.smk +++ b/workflow/rules/calling-bcftools.smk @@ -64,10 +64,11 @@ if config["settings"].get("contig-group-size"): rule sort_variants: input: vcf="calling/merged/merged-all.vcf.gz", + done="calling/merged/merged-all.vcf.gz.done", refdict=genome_dict(), output: vcf="calling/genotyped-all.vcf.gz", - done=touch("calling/genotyped-all.done"), + done=touch("calling/genotyped-all.vcf.gz.done"), params: # See duplicates-picard.smk for the reason whe need this on MacOS. 
             extra=(
diff --git a/workflow/rules/calling-freebayes.smk b/workflow/rules/calling-freebayes.smk
index f0b1f38..3852c5f 100644
--- a/workflow/rules/calling-freebayes.smk
+++ b/workflow/rules/calling-freebayes.smk
@@ -38,6 +38,7 @@ rule call_variants:
         # Without bai files, freebayes claims that it recomputes them, but actually crashes...
         samples=get_all_bams(),
         indices=get_all_bais(),
+        dones=get_all_bams_done(),
         # If known variants are set, use this as input to ensure the file exists.
         # We use the file via the know_variants_extra() function call below,
         # but request it here as well to ensure that it and its index are present.
@@ -58,7 +59,7 @@ rule call_variants:
         ),
     output:
         pipe("calling/called/{contig}.vcf"),
-        touch("calling/called/{contig}.vcf.done"),
+        # touch("calling/called/{contig}.vcf.done"),
     log:
         "logs/calling/freebayes/{contig}.log",
     benchmark:
@@ -117,6 +118,12 @@ def merge_variants_vcfs_input(wildcards):
     return expand("calling/called/{contig}.vcf.gz", contig=get_contigs(fai))
 
 
+# Need to work around snakemake issues by requiring the done files
+def merge_variants_vcfs_input_done(wildcards):
+    fai = checkpoints.samtools_faidx.get().output[0]
+    return expand("calling/called/{contig}.vcf.gz.done", contig=get_contigs(fai))
+
+
 rule merge_variants:
     input:
         # fai is needed to calculate aggregation over contigs below.
@@ -128,9 +135,10 @@ rule merge_variants:
         # The wrapper expects input to be called `vcfs`, but we can use `vcf.gz` as well.
         # vcfs=lambda w: expand("calling/called/{contig}.vcf.gz", contig=get_contigs())
         vcfs=merge_variants_vcfs_input,
+        done=merge_variants_vcfs_input_done,
     output:
         vcf="calling/genotyped-all.vcf.gz",
-        done=touch("calling/genotyped-all.done"),
+        done=touch("calling/genotyped-all.vcf.gz.done"),
     params:
         # See duplicates-picard.smk for the reason whe need this on MacOS.
         extra=(
diff --git a/workflow/rules/calling-haplotypecaller.smk b/workflow/rules/calling-haplotypecaller.smk
index 93365af..e06eadb 100644
--- a/workflow/rules/calling-haplotypecaller.smk
+++ b/workflow/rules/calling-haplotypecaller.smk
@@ -25,6 +25,7 @@ rule call_variants:
         # Get the sample data.
         bam=get_sample_bams_wildcards,
         bai=get_sample_bais_wildcards,
+        done=get_sample_bams_wildcards_done,
         # Get the reference genome, as well as its indices.
ref=config["data"]["reference-genome"], refidcs=expand( @@ -60,7 +61,7 @@ rule call_variants: if config["settings"]["keep-intermediate"]["calling"] else temp("calling/called/{sample}.{contig}.g.vcf.gz.tbi") ), - done=touch("calling/called/{sample}.{contig}.g.done"), + done=touch("calling/called/{sample}.{contig}.g.vcf.gz.done"), log: "logs/calling/gatk-haplotypecaller/{sample}.{contig}.log", benchmark: @@ -135,13 +136,17 @@ rule combine_calls: "calling/called/{sample}.{{contig}}.g.vcf.gz.tbi", sample=config["global"]["sample-names"], ), + done=expand( + "calling/called/{sample}.{{contig}}.g.vcf.gz.done", + sample=config["global"]["sample-names"], + ), output: gvcf=( "calling/combined/all.{contig}.g.vcf.gz" if config["settings"]["keep-intermediate"]["calling"] else temp("calling/combined/all.{contig}.g.vcf.gz") ), - done=touch("calling/combined/all.{contig}.g.done"), + done=touch("calling/combined/all.{contig}.g.vcf.gz.done"), params: extra=config["params"]["gatk"]["CombineGVCFs-extra"] + ( @@ -173,13 +178,14 @@ rule genotype_variants: ), refdict=genome_dict(), gvcf="calling/combined/all.{contig}.g.vcf.gz", + done="calling/combined/all.{contig}.g.vcf.gz.done", output: vcf=( "calling/genotyped/all.{contig}.vcf.gz" if config["settings"]["keep-intermediate"]["calling"] else temp("calling/genotyped/all.{contig}.vcf.gz") ), - done=touch("calling/genotyped/all.{contig}.done"), + done=touch("calling/genotyped/all.{contig}.vcf.gz.done"), params: extra=config["params"]["gatk"]["GenotypeGVCFs-extra"] + ( @@ -211,6 +217,12 @@ def merge_variants_vcfs_input(wildcards): return expand("calling/genotyped/all.{contig}.vcf.gz", contig=get_contigs(fai)) +# Also need to trick snakemake into completing all files... +def merge_variants_vcfs_input_done(wildcards): + fai = checkpoints.samtools_faidx.get().output[0] + return expand("calling/genotyped/all.{contig}.vcf.gz.done", contig=get_contigs(fai)) + + rule merge_variants: input: # fai is needed to calculate aggregation over contigs below. @@ -221,9 +233,10 @@ rule merge_variants: contig_groups=contigs_groups_input, # vcfs=lambda w: expand("calling/genotyped/all.{contig}.vcf.gz", contig=get_contigs()) vcfs=merge_variants_vcfs_input, + done=merge_variants_vcfs_input_done, output: vcf="calling/genotyped-all.vcf.gz", - done=touch("calling/genotyped-all.done"), + done=touch("calling/genotyped-all.vcf.gz.done"), params: # See duplicates-picard.smk for the reason whe need this on MacOS. extra=( diff --git a/workflow/rules/damage.smk b/workflow/rules/damage.smk index 9e8242b..48a0ae8 100644 --- a/workflow/rules/damage.smk +++ b/workflow/rules/damage.smk @@ -10,6 +10,7 @@ rule mapdamage: # Get either the normal mapping output, or, if additional filtering via `samtools view` # is set in the config settings: filter-mapped-reads, use the filtered output instead. get_mapped_reads, + get_mapped_reads_done, # "mapping/sorted/{sample}-{unit}.bam" # Get the reference genome and its indices. Not sure if the indices are needed # for this particular rule, but doesn't hurt to include them as an input anyway. @@ -60,6 +61,7 @@ rule damageprofiler: # Get either the normal mapping output, or, if additional filtering via `samtools view` # is set in the config settings: filter-mapped-reads, use the filtered output instead. get_mapped_reads, + get_mapped_reads_done, # "mapping/sorted/{sample}-{unit}.bam" # Get the reference genome and its indices. Not sure if the indices are needed # for this particular rule, but doesn't hurt to include them as an input anyway. 
diff --git a/workflow/rules/duplicates-dedup.smk b/workflow/rules/duplicates-dedup.smk
index a18882f..1649576 100644
--- a/workflow/rules/duplicates-dedup.smk
+++ b/workflow/rules/duplicates-dedup.smk
@@ -28,6 +28,7 @@ rule mark_duplicates:
         # or the clipped ones, if those were requested.
         # We always use the merged units per sample here.
         get_mapped_reads,
+        get_mapped_reads_done,
     output:
         bam=temp("mapping/dedup/{sample}_rmdup.bam"),
         metrics="mapping/dedup/{sample}.dedup.json",
@@ -46,7 +47,7 @@ rule mark_duplicates:
     shell:
         # Dedup bases its output names on this, so we need some more trickery to make it
         # output the file names that we want.
-        "dedup -i {input} -o {params.out_dir} {params.extra} > {log} 2>&1 ; "
+        "dedup -i {input[0]} -o {params.out_dir} {params.extra} > {log} 2>&1 ; "
         # "mv"
         # "    dedup/{wildcards.sample}." + get_mapped_read_infix() + "_rmdup.bam"
         # "    dedup/{wildcards.sample}_rmdup.bam ; "
@@ -58,13 +59,14 @@
 rule sort_reads_dedup:
     input:
         "mapping/dedup/{sample}_rmdup.bam",
+        "mapping/dedup/{sample}.dedup.done",
     output:
         (
             "mapping/dedup/{sample}.bam"
             if config["settings"]["keep-intermediate"]["mapping"]
             else temp("mapping/dedup/{sample}.bam")
         ),
-        touch("mapping/dedup/{sample}.done"),
+        touch("mapping/dedup/{sample}.bam.done"),
     params:
         extra=config["params"]["samtools"]["sort"],
         tmp_dir=config["params"]["samtools"]["temp-dir"],
diff --git a/workflow/rules/duplicates-picard.smk b/workflow/rules/duplicates-picard.smk
index 958fbff..e6346c6 100644
--- a/workflow/rules/duplicates-picard.smk
+++ b/workflow/rules/duplicates-picard.smk
@@ -13,7 +13,8 @@ rule mark_duplicates:
         # is set in the config settings: filter-mapped-reads, use the filtered output instead,
         # or the clipped ones, if those were requested.
         # We always use the merged units per sample here.
-        get_mapped_reads,
+        bams=get_mapped_reads,
+        done=get_mapped_reads_done,
     output:
         bam=(
             "mapping/dedup/{sample}.bam"
@@ -21,7 +22,7 @@ rule mark_duplicates:
             else temp("mapping/dedup/{sample}.bam")
         ),
         metrics="qc/dedup/{sample}.metrics.txt",
-        done=touch("mapping/dedup/{sample}.done"),
+        done=touch("mapping/dedup/{sample}.bam.done"),
     log:
         "logs/mapping/picard-markduplicates/{sample}.log",
     benchmark:
@@ -32,13 +33,15 @@ rule mark_duplicates:
         # and some system libraries, leading the JRE to exit with fatal error SIGSEGV caused by
         # libgkl_compression, see https://github.com/broadinstitute/picard/issues/1329.
         # Hence, on MacOS, we add the two settings recommended by the github issue.
- config["params"]["picard"]["MarkDuplicates-java-opts"] - + " " - + config["params"]["picard"]["MarkDuplicates"] + extra=config["params"]["picard"]["MarkDuplicates"] + (" USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" if platform.system() == "Darwin" else ""), + java_opts=config["params"]["picard"]["MarkDuplicates-java-opts"] + # resources: + # mem_mb=1024, group: "mapping_extra" conda: "../envs/picard.yaml" wrapper: - "0.51.3/bio/picard/markduplicates" + "v5.7.0/bio/picard/markduplicates" + # "0.51.3/bio/picard/markduplicates" diff --git a/workflow/rules/filtering-bcftools-filter.smk b/workflow/rules/filtering-bcftools-filter.smk index 6133d9c..de914c5 100644 --- a/workflow/rules/filtering-bcftools-filter.smk +++ b/workflow/rules/filtering-bcftools-filter.smk @@ -11,13 +11,14 @@ def get_filter(wildcards): rule bcftools_filter_calls: input: vcf="calling/filtered/all.{vartype}.selected.vcf.gz", + don="calling/filtered/all.{vartype}.selected.vcf.gz.done", output: vcf=( "calling/filtered/all.{vartype}.filtered.vcf.gz" if config["settings"]["keep-intermediate"]["filtering"] else temp("calling/filtered/all.{vartype}.filtered.vcf.gz") ), - done=touch("calling/filtered/all.{vartype}.filtered.done"), + done=touch("calling/filtered/all.{vartype}.filtered.vcf.gz.done"), params: tool=config["params"]["bcftools-filter"]["tool"], filters=get_filter, diff --git a/workflow/rules/filtering-gatk-variantfiltration.smk b/workflow/rules/filtering-gatk-variantfiltration.smk index ef2887d..47d8bbc 100644 --- a/workflow/rules/filtering-gatk-variantfiltration.smk +++ b/workflow/rules/filtering-gatk-variantfiltration.smk @@ -16,6 +16,7 @@ rule gatk_hard_filter_calls: input: ref=config["data"]["reference-genome"], vcf="calling/filtered/all.{vartype}.selected.vcf.gz", + don="calling/filtered/all.{vartype}.selected.vcf.gz.done", refdict=genome_dict(), output: vcf=( @@ -23,7 +24,7 @@ rule gatk_hard_filter_calls: if config["settings"]["keep-intermediate"]["filtering"] else temp("calling/filtered/all.{vartype}.filtered.vcf.gz") ), - done=touch("calling/filtered/all.{vartype}.filtered.done"), + done=touch("calling/filtered/all.{vartype}.filtered.vcf.gz.done"), params: filters=get_filter, extra=config["params"]["gatk-variantfiltration"]["extra"], diff --git a/workflow/rules/filtering-gatk-vqsr.smk b/workflow/rules/filtering-gatk-vqsr.smk index bd5853e..42cc60e 100644 --- a/workflow/rules/filtering-gatk-vqsr.smk +++ b/workflow/rules/filtering-gatk-vqsr.smk @@ -76,6 +76,7 @@ rule gatk_variant_recalibrator: **get_variant_recalibrator_resource_files(), ref=config["data"]["reference-genome"], vcf="calling/filtered/all.{vartype}.selected.vcf.gz", + done="calling/filtered/all.{vartype}.selected.vcf.gz.done", refdict=genome_dict(), # Resources have to be given as named input files, we unpack the dict to get them. # We also request the tbi index files, so that users get a nice error message if missing. @@ -88,7 +89,7 @@ rule gatk_variant_recalibrator: # We also might produce a plot PDF about the trances - but only for the SNPs, # not for the INDELs, so we do not specify it here for simplicity... # The avid user will find it in the `filtered` directory either way. 
- done=touch("calling/filtered/all.{vartype}.vqsr-recal.done"), + done=touch("calling/filtered/all.{vartype}.vqsr-recal.vcf.gz.done"), params: # set mode, must be either SNP, INDEL or BOTH mode="{vartype}", @@ -127,15 +128,17 @@ rule gatk_apply_vqsr: ref=config["data"]["reference-genome"], refdict=genome_dict(), vcf="calling/filtered/all.{vartype}.selected.vcf.gz", + vcf_done="calling/filtered/all.{vartype}.selected.vcf.gz.done", recal="calling/filtered/all.{vartype}.vqsr-recal.vcf.gz", tranches="calling/filtered/all.{vartype}.vqsr-recal.tranches", + recal_done="calling/filtered/all.{vartype}.vqsr-recal.vcf.gz.done", output: vcf=( "calling/filtered/all.{vartype}.recalibrated.vcf.gz" if config["settings"]["keep-intermediate"]["filtering"] else temp("calling/filtered/all.{vartype}.recalibrated.vcf.gz") ), - done=touch("calling/filtered/all.{vartype}.recalibrated.done"), + done=touch("calling/filtered/all.{vartype}.recalibrated.vcf.gz.done"), log: "logs/calling/gatk-applyvqsr/{vartype}.log", benchmark: diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk index 68d6b71..81b0c51 100644 --- a/workflow/rules/filtering.smk +++ b/workflow/rules/filtering.smk @@ -10,6 +10,7 @@ rule select_calls: input: ref=config["data"]["reference-genome"], vcf="calling/genotyped-all.vcf.gz", + done="calling/genotyped-all.vcf.gz.done", refdict=genome_dict(), # bcftools does not automatically create vcf index files, so we need to specifically request them... # ... but the picard merge tool that we use right now does create tbi files, so all good atm. @@ -20,7 +21,7 @@ rule select_calls: if config["settings"]["keep-intermediate"]["filtering"] else temp("calling/filtered/all.{vartype}.selected.vcf.gz") ), - done=touch("calling/filtered/all.{vartype}.selected.done"), + done=touch("calling/filtered/all.{vartype}.selected.vcf.gz.done"), params: extra="--select-type-to-include {vartype}", log: @@ -92,10 +93,19 @@ rule merge_calls: else "filtered" ), ), + done=expand( + "calling/filtered/all.{vartype}.{filtertype}.vcf.gz.done", + vartype=["SNP", "INDEL"], + filtertype=( + "recalibrated" + if config["settings"]["filter-variants"] == "gatk-vqsr" + else "filtered" + ), + ), output: vcf="calling/filtered-all.vcf.gz", # vcf=protected("calling/filtered-all.vcf.gz") - done=touch("calling/filtered/all.done"), + done=touch("calling/filtered-all.vcf.gz.done"), params: # See duplicates-picard.smk for the reason whe need this on MacOS. extra=( diff --git a/workflow/rules/frequency.smk b/workflow/rules/frequency.smk index dd43e9c..fffbe0a 100644 --- a/workflow/rules/frequency.smk +++ b/workflow/rules/frequency.smk @@ -245,6 +245,7 @@ if impmethod in ["simpute", "npute"]: rule hafpipe_impute_snp_table: input: snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv", + done=get_hafpipe_snp_table_dir() + "/{chrom}.done", bins=get_hafpipe_bins(), output: csv=get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod, @@ -276,6 +277,7 @@ elif impmethod != "": rule hafpipe_impute_snp_table: input: snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv", + done=get_hafpipe_snp_table_dir() + "/{chrom}.done", output: csv=get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod, done=touch(get_hafpipe_snp_table_dir() + "/{chrom}.csv." 
+ impmethod + ".done"), @@ -316,6 +318,7 @@ if impmethod == "": snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv", alleleCts=get_hafpipe_snp_table_dir() + "/{chrom}.csv.alleleCts", numeric=get_hafpipe_snp_table_dir() + "/{chrom}.csv.numeric.bgz", + done=get_hafpipe_snp_table_dir() + "/{chrom}.done", output: flag=get_hafpipe_snp_table_dir() + "/{chrom}.csv.flag", shell: @@ -331,6 +334,7 @@ else: rule hafpipe_snp_table_indices: input: snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod, + done=get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".done", bins=get_hafpipe_bins(), # require that HAF-pipe scripts are there output: alleleCts=get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".alleleCts", @@ -399,6 +403,7 @@ rule hafpipe_sample_bams: input: bam=get_sample_bams_wildcards, # provided in mapping.smk bai=get_sample_bais_wildcards, # provided in mapping.smk + done=get_sample_bams_wildcards_done, output: bam="hafpipe/bam/{sample}.bam", bai="hafpipe/bam/{sample}.bam.bai", @@ -466,6 +471,7 @@ rule hafpipe_allele_frequencies: snptable=get_hafpipe_snp_table, # provided above flag=get_hafpipe_snp_table_flag, freqs="hafpipe/frequencies/{sample}.bam.{chrom}.freqs", # from Task 3 above + dones="hafpipe/frequencies/{sample}.bam.{chrom}.freqs.done", bins=get_hafpipe_bins(), output: # Same as above: just expect the file name as produced by HAFpipe. @@ -500,9 +506,10 @@ def collect_sample_hafpipe_allele_frequencies(wildcards): # Snakemake then needs an input function to work with the fai checkpoint here. fai = checkpoints.samtools_faidx.get().output[0] return expand( - "hafpipe/frequencies/{sample}.bam.{chrom}.afSite", + "hafpipe/frequencies/{sample}.bam.{chrom}.afSite{ext}", sample=wildcards.sample, chrom=get_hafpipe_chromosomes(fai), + ext=["", ".done"], ) @@ -536,6 +543,7 @@ rule hafpipe_collect_concat_samples: + (".gz" if config["params"]["hafpipe"].get("compress-sample-tables", False) else ""), sample=config["global"]["sample-names"], ), + done="hafpipe/samples/{sample}.done", output: done=touch("hafpipe/samples.done"), @@ -557,9 +565,10 @@ def collect_all_hafpipe_allele_frequencies(wildcards): # Snakemake then needs an input function to work with the fai checkpoint here. 
     fai = checkpoints.samtools_faidx.get().output[0]
     return expand(
-        "hafpipe/frequencies/{sample}.bam.{chrom}.afSite",
+        "hafpipe/frequencies/{sample}.bam.{chrom}.afSite{ext}",
         sample=config["global"]["sample-names"],
         chrom=get_hafpipe_chromosomes(fai),
+        ext=["", ".done"],
     )
@@ -627,8 +636,11 @@ localrules:
 rule all_hafpipe:
     input:
         "hafpipe/afSite.done",
+        [
         "hafpipe/all.csv"
-        + (".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else "")
+        + (".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else ""),
+        "hafpipe/all.done"
+        ]
         if config["params"]["hafpipe"].get("make-merged-table", False)
         else [],
         "hafpipe/samples.done"
diff --git a/workflow/rules/initialize.smk b/workflow/rules/initialize.smk
index a173d39..fa1e216 100644
--- a/workflow/rules/initialize.smk
+++ b/workflow/rules/initialize.smk
@@ -8,6 +8,7 @@
 import socket, platform
 import subprocess
 from datetime import datetime
 import logging
+from pathlib import Path
 
 from snakemake_interface_executor_plugins.settings import ExecMode
diff --git a/workflow/rules/mapping-bowtie2.smk b/workflow/rules/mapping-bowtie2.smk
index 9dbb8e3..9638b8f 100644
--- a/workflow/rules/mapping-bowtie2.smk
+++ b/workflow/rules/mapping-bowtie2.smk
@@ -17,6 +17,7 @@ rule bowtie2_index:
             config["data"]["reference-genome"] + ".{ext}",
             ext=["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"],
         ),
+        done=touch(config["data"]["reference-genome"] + ".done")
     params:
         # Bowtie expects the prefix, and creates the above output files automatically.
         # So, let's do some snakemake magic to make this work.
@@ -30,11 +31,6 @@ rule bowtie2_index:
         "bowtie2-build {input[0]} {params.basename} > {log} 2>&1"
 
 
-# Rule is not submitted as a job to the cluster.
-# localrules:
-#     bowtie2_index,
-
-
 # =================================================================================================
 #     Read Mapping
 # =================================================================================================
@@ -56,13 +52,14 @@ def get_bowtie2_extra(wildcards):
 rule map_reads:
     input:
         sample=get_trimmed_reads,
+        sample_done=get_trimmed_reads_done,
         indices=expand(
             config["data"]["reference-genome"] + ".{ext}",
-            ext=["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"],
+            ext=["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2", "done"],
         ),
     output:
         pipe("mapping/mapped/{sample}-{unit}.bam"),
-        touch("mapping/mapped/{sample}-{unit}.done"),
+        # touch("mapping/mapped/{sample}-{unit}.bam.done"),
     params:
         # Prefix of reference genome index (built with bowtie2-build above)
         index=config["data"]["reference-genome"],
@@ -92,13 +89,13 @@ rule sort_reads:
     input:
         "mapping/mapped/{sample}-{unit}.bam",
     output:
         (
             "mapping/sorted/{sample}-{unit}.bam"
             if config["settings"]["keep-intermediate"]["mapping"]
             else temp("mapping/sorted/{sample}-{unit}.bam")
         ),
-        touch("mapping/sorted/{sample}-{unit}.done"),
+        touch("mapping/sorted/{sample}-{unit}.bam.done"),
     params:
         extra=config["params"]["samtools"]["sort"],
         tmp_dir=config["params"]["samtools"]["temp-dir"],
diff --git a/workflow/rules/mapping-bwa-aln.smk b/workflow/rules/mapping-bwa-aln.smk
index 156afb3..e7b44a0 100644
--- a/workflow/rules/mapping-bwa-aln.smk
+++ b/workflow/rules/mapping-bwa-aln.smk
@@ -36,13 +36,14 @@ rule map_reads:
             config["data"]["reference-genome"] + ".{ext}",
             ext=["amb", "ann", "bwt", "pac", "sa", "fai"],
         ),
+        reads_done=get_trimmed_reads_done,
     output:
         (
             "mapping/sai/{sample}-{unit}-{pair}.sai"
             if config["settings"]["keep-intermediate"]["mapping"]
             else temp("mapping/sai/{sample}-{unit}-{pair}.sai")
         ),
-        touch("mapping/sai/{sample}-{unit}-{pair}.done"),
+        touch("mapping/sai/{sample}-{unit}-{pair}.sai.done"),
         # Absolutely no idea why the following does not work. Snakemake complains that the pipe
         # is not used at all, which is simply wrong, as it is clearly used by bwa_sai_to_bam,
         # and also why would this rule here be called if its output file was not requested at all?!
@@ -102,6 +103,8 @@ def get_sai(wildcards):
         pair=[1, 2],
     )
 
+def get_sai_done(wildcards):
+    return [f + ".done" for f in get_sai(wildcards)]
 
 def get_bwa_aln_extra(wildcards):
     # We need the read group tags, including `ID` and `SM`, as downstream tools use these.
@@ -116,6 +119,7 @@ rule bwa_sai_to_bam:
     input:
         fastq=get_trimmed_reads,
         sai=get_sai,
+        done=get_sai_done,
         ref=config["data"]["reference-genome"],
         # Somehow, the wrapper expects the index extensions to be given,
         # instead of the underlying fasta file... Well, so let's do that.
@@ -127,7 +131,7 @@ rule bwa_sai_to_bam:
         ),
     output:
         pipe("mapping/sorted/{sample}-{unit}-unclean.bam"),
-        touch("mapping/sorted/{sample}-{unit}-unclean.done"),
+        # touch("mapping/sorted/{sample}-{unit}-unclean.bam.done"),
     params:
         extra=get_bwa_aln_extra,
         # Sort as we need it.
@@ -167,7 +171,7 @@ rule bwa_bam_clean:
             if config["settings"]["keep-intermediate"]["mapping"]
             else temp("mapping/sorted/{sample}-{unit}.bam")
         ),
-        touch("mapping/sorted/{sample}-{unit}.done"),
+        touch("mapping/sorted/{sample}-{unit}.bam.done"),
     params:
         # See duplicates-picard.smk for the reason whe need this on MacOS.
         extra=(
diff --git a/workflow/rules/mapping-bwa-mem.smk b/workflow/rules/mapping-bwa-mem.smk
index 922a124..4fe6d28 100644
--- a/workflow/rules/mapping-bwa-mem.smk
+++ b/workflow/rules/mapping-bwa-mem.smk
@@ -16,6 +16,7 @@ def get_bwa_mem_extra(wildcards):
 rule map_reads:
     input:
         reads=get_trimmed_reads,
+        reads_done=get_trimmed_reads_done,
         ref=config["data"]["reference-genome"],
         idx=expand(
             config["data"]["reference-genome"] + ".{ext}",
@@ -27,7 +28,7 @@ rule map_reads:
             if config["settings"]["keep-intermediate"]["mapping"]
             else temp("mapping/sorted/{sample}-{unit}.bam")
         ),
-        touch("mapping/sorted/{sample}-{unit}.done"),
+        touch("mapping/sorted/{sample}-{unit}.bam.done"),
     params:
         index=config["data"]["reference-genome"],
         extra=get_bwa_mem_extra,
diff --git a/workflow/rules/mapping-bwa-mem2.smk b/workflow/rules/mapping-bwa-mem2.smk
index efcdcfe..c43dffd 100644
--- a/workflow/rules/mapping-bwa-mem2.smk
+++ b/workflow/rules/mapping-bwa-mem2.smk
@@ -16,6 +16,7 @@ def get_bwa_mem2_extra(wildcards):
 rule map_reads:
     input:
         reads=get_trimmed_reads,
+        reads_done=get_trimmed_reads_done,
         ref=config["data"]["reference-genome"],
         # Somehow, the wrapper expects the index extensions to be given,
         # instead of the underlying fasta file... Well, so let's do that.
@@ -31,7 +32,7 @@ rule map_reads:
             if config["settings"]["keep-intermediate"]["mapping"]
             else temp("mapping/sorted/{sample}-{unit}.bam")
         ),
-        touch("mapping/sorted/{sample}-{unit}.done"),
+        touch("mapping/sorted/{sample}-{unit}.bam.done"),
     params:
         extra=get_bwa_mem2_extra,
         # Sort as we need it.
diff --git a/workflow/rules/mapping-recalibrate.smk b/workflow/rules/mapping-recalibrate.smk
index a16f441..d6649ee 100644
--- a/workflow/rules/mapping-recalibrate.smk
+++ b/workflow/rules/mapping-recalibrate.smk
@@ -41,7 +41,7 @@ if config["settings"]["recalibrate-base-qualities"]:
     )
 
 
-def get_recal_input(bai=False):
+def get_recal_input(ext=""):
     # case 1: no duplicate removal
     f = "mapping/merged/{sample}.bam"
 
@@ -57,7 +57,7 @@ def get_recal_input(ext=""):
     if config["settings"]["remove-duplicates"]:
         f = "mapping/dedup/{sample}.bam"
 
-    if bai:
+    if ext == ".bai":
         if config["settings"].get("restrict-regions"):
             # case 5: need an index because random access is required
             f += ".bai"
@@ -66,13 +66,14 @@ def get_recal_input(ext=""):
             # case 6: no index needed
             return []
     else:
-        return f
+        return f + ext
 
 
 rule recalibrate_base_qualities:
     input:
         bam=get_recal_input(),
-        bai=get_recal_input(bai=True),
+        bai=get_recal_input(ext=".bai"),
+        done=get_recal_input(ext=".done"),
         ref=config["data"]["reference-genome"],
         refidcs=expand(
             config["data"]["reference-genome"] + ".{ext}",
@@ -86,7 +87,7 @@ rule recalibrate_base_qualities:
             if config["settings"]["keep-intermediate"]["mapping"]
             else temp("mapping/recal/{sample}.bam")
         ),
-        done=touch("mapping/recal/{sample}.done"),
+        done=touch("mapping/recal/{sample}.bam.done"),
         # bam=protected("mapping/recal/{sample}.bam")
     params:
         extra=get_gatk_regions_param() + " " + config["params"]["gatk"]["BaseRecalibrator"],
diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk
index 97d0d01..272b30d 100644
--- a/workflow/rules/mapping.smk
+++ b/workflow/rules/mapping.smk
@@ -128,6 +128,21 @@ def get_sorted_sample_bams(wildcards):
     return expand("mapping/sorted/{{sample}}-{unit}.bam", unit=get_sample_units(wildcards.sample))
 
 
+# Also need the done files, with wildcards, so wrap it!
+def get_sorted_sample_bams_done(wildcards):
+    bams = get_sorted_sample_bams(wildcards)
+    return [b + ".done" for b in bams]
+
+
+def get_all_sorted_sample_bams():
+    res = list()
+    for sample in config["global"]["sample-names"]:
+        for unit in get_sample_units(sample):
+            bam = f"mapping/sorted/{sample}-{unit}.bam"
+            res.append(bam)
+    return res
+
+
 # This is where all units are merged together.
 # We changed this behaviour. grenepipe v0.11.1 and before did _all_ the steps in this file
 # individually per unit. After that, we changed to merge here, and then run every subsequent
@@ -137,9 +152,14 @@ def get_sorted_sample_bams(wildcards):
 rule merge_sample_unit_bams:
     input:
         get_sorted_sample_bams,
+        # We cannot request the done files here, as the wrapper
+        # simply wants to use them as bam files for merging as well...
+        # So, to ensure that they are requested somewhere,
+        # we do that below in the all_bams rule instead.
+        # get_sorted_sample_bams_done,
     output:
         "mapping/merged/{sample}.bam",
-        touch("mapping/merged/{sample}.done"),
+        touch("mapping/merged/{sample}.bam.done"),
     params:
         extra=config["params"]["samtools"]["merge"],
     threads: config["params"]["samtools"]["merge-threads"]
@@ -161,13 +181,14 @@ rule merge_sample_unit_bams:
 rule filter_mapped_reads:
     input:
         "mapping/merged/{sample}.bam",
+        "mapping/merged/{sample}.bam.done",
     output:
         (
             "mapping/filtered/{sample}.bam"
             if config["settings"]["keep-intermediate"]["mapping"]
             else temp("mapping/filtered/{sample}.bam")
         ),
-        touch("mapping/filtered/{sample}.done"),
+        touch("mapping/filtered/{sample}.bam.done"),
     params:
         extra=config["params"]["samtools"]["view"] + " -b",
     log:
@@ -185,13 +206,16 @@ rule clip_read_overlaps:
         "mapping/filtered/{sample}.bam"
         if (config["settings"]["filter-mapped-reads"])
        else ("mapping/merged/{sample}.bam"),
+        "mapping/filtered/{sample}.bam.done"
+        if (config["settings"]["filter-mapped-reads"])
+        else ("mapping/merged/{sample}.bam.done"),
     output:
         (
             "mapping/clipped/{sample}.bam"
             if config["settings"]["keep-intermediate"]["mapping"]
             else temp("mapping/clipped/{sample}.bam")
         ),
-        touch("mapping/clipped/{sample}.done"),
+        touch("mapping/clipped/{sample}.bam.done"),
     params:
         extra=config["params"]["bamutil"]["extra"],
     log:
@@ -223,6 +247,12 @@ def get_mapped_reads(wildcards):
     return result
 
 
+# Need an extra function for this, as this is called with wildcards in the rule,
+# so we cannot provide it with extra arguments there, and hence have to do this here.
+def get_mapped_reads_done(wildcards):
+    return get_mapped_reads(wildcards) + ".done"
+
+
 # We need to get a bit dirty here, as dedup names output files based on input file names,
 # so that depending on what kind of input (mapped, filtered, clipped) we give it,
 # we get differently named output files, which does not work well with snakemake.
@@ -291,15 +321,37 @@ if config["settings"]["recalibrate-base-qualities"]:
 
 
 # =================================================================================================
-#     Final Mapping Result
+#     External Mapping
 # =================================================================================================
 
 
 # Helper function for the case that a `mappings-table` is provided, in which case we do not run
-# any mapping ourselves, and skipp all of the above. Instead, we then simply want to use the
+# any mapping ourselves, and skip all of the above. Instead, we then simply want to use the
 # sample bam file from the table here.
+# However, in our current setup, we require "done" files to trick snakemake into working properly.
+# These might not exist when running with bam files from the outside, so we touch them here.
 def get_bam_from_mappings_table(sample):
-    return config["global"]["samples"].loc[sample, ["bam"]].dropna()
+    assert "mappings-table" in config["data"] and config["data"]["mappings-table"]
+    bams = config["global"]["samples"].loc[sample, ["bam"]].dropna()
+
+    # Check if we have touched the bam done files already
+    if not hasattr(get_bam_from_mappings_table, "done"):
+        get_bam_from_mappings_table.done = False
+
+    # If not, touch all files, then set the internal flag
+    # so that we do not do this every time this function is called.
+    if not get_bam_from_mappings_table.done:
+        for f in bams:
+            Path(f + ".done").touch()
+        get_bam_from_mappings_table.done = True
+
+    # Now we can return the bam file list to the caller.
+    return bams
+
+
+# =================================================================================================
+#     Final Mapping Result
+# =================================================================================================
 
 
 # At this point, we have several choices of which files we want to hand down to the next
@@ -310,7 +362,7 @@ def get_bam_from_mappings_table(sample):
 # (see above), what we get here is the file name of the last step that we want to apply.
 
 
-def get_mapping_result(sample, bai=False):
+def get_mapping_result(sample, ext=""):
     # Special case: we are using the mappings table of bam files,
     # instead of any of the ones produced with the rules here.
     if "mappings-table" in config["data"] and config["data"]["mappings-table"]:
@@ -339,10 +391,8 @@ def get_mapping_result(sample, ext=""):
     if config["settings"]["recalibrate-base-qualities"]:
         f = "mapping/recal/{sample}.bam".format(sample=sample)
 
-    # Additionally, this function is run for getting bai files as well
-    if bai:
-        f += ".bai"
-
+    # Additionally, this function is run for getting bai/done files as well
+    f += ext
     return f
 
 
@@ -357,7 +407,12 @@ def get_sample_bams(sample):
 
 # Return the bai file(s) for a given sample
 def get_sample_bais(sample):
-    return get_mapping_result(sample, True)
+    return get_mapping_result(sample, ".bai")
+
+
+# Need the done file as well
+def get_sample_bams_done(sample):
+    return get_mapping_result(sample, ".done")
 
 
 # Return the bam file(s) for a sample, given a wildcard param from a rule.
@@ -370,6 +425,11 @@ def get_sample_bais_wildcards(wildcards):
     return get_sample_bais(wildcards.sample)
 
 
+# Need the done file as well
+def get_sample_bams_wildcards_done(wildcards):
+    return get_sample_bams_done(wildcards.sample)
+
+
 # Return the bam file(s) for all samples
 def get_all_bams():
     # Make a list of all bams in the order as the samples list.
@@ -383,7 +443,15 @@ def get_all_bais():
     res = list()
     for sample in config["global"]["sample-names"]:
-        res.append(get_mapping_result(sample, True))
+        res.append(get_mapping_result(sample, ".bai"))
     return res
+
+
+# We also need a list of all the "done" files to trick snakemake into working properly.
+def get_all_bams_done():
+    res = list()
+    for sample in config["global"]["sample-names"]:
+        res.append(get_mapping_result(sample, ".done"))
+    return res
@@ -405,7 +473,9 @@ def get_all_bais():
 rule all_bams:
     input:
+        merged=get_all_sorted_sample_bams(),
         bams=get_all_bams(),
+        done=get_all_bams_done(),
         qc="qc/multiqc.html",
     output:
         bams=expand("mapping/final/{sample}.bam", sample=config["global"]["sample-names"]),
diff --git a/workflow/rules/pileup.smk b/workflow/rules/pileup.smk
index b38b7ff..2b5ebd3 100644
--- a/workflow/rules/pileup.smk
+++ b/workflow/rules/pileup.smk
@@ -8,7 +8,7 @@ rule mpileup_merge_all:
         get_all_bams(),
     output:
         pipe("mpileup/all.merged.bam"),
-        touch("mpileup/all.merged.done"),
+        # touch("mpileup/all.merged.done"),
     params:
         # Need file overwrite flag, see above.
extra=config["params"]["samtools"]["merge"] + " -f", diff --git a/workflow/rules/qc-vcf.smk b/workflow/rules/qc-vcf.smk index 0b7b332..50e2d33 100644 --- a/workflow/rules/qc-vcf.smk +++ b/workflow/rules/qc-vcf.smk @@ -11,6 +11,11 @@ rule bcftools_stats: if not config["settings"]["filter-variants"] == "none" else "calling/genotyped-all.vcf.gz" ), + done=( + "calling/filtered-all.vcf.gz.done" + if not config["settings"]["filter-variants"] == "none" + else "calling/genotyped-all.vcf.gz.done" + ), output: "qc/bcftools-stats/stats.vchk", log: diff --git a/workflow/rules/stats.smk b/workflow/rules/stats.smk index a38bf0c..3673c92 100644 --- a/workflow/rules/stats.smk +++ b/workflow/rules/stats.smk @@ -72,6 +72,11 @@ rule frequency_table: if not config["settings"]["filter-variants"] == "none" else "calling/genotyped-all.vcf.gz" ), + done=( + "calling/filtered-all.vcf.gz.done" + if not config["settings"]["filter-variants"] == "none" + else "calling/genotyped-all.vcf.gz.done" + ), output: "tables/frequencies.tsv", params: diff --git a/workflow/rules/trimming-adapterremoval.smk b/workflow/rules/trimming-adapterremoval.smk index c10f3e0..b79ee6c 100644 --- a/workflow/rules/trimming-adapterremoval.smk +++ b/workflow/rules/trimming-adapterremoval.smk @@ -13,7 +13,7 @@ rule trim_reads_se: else temp("trimming/{sample}-{unit}.fastq.gz") ), settings="trimming/{sample}-{unit}.se.settings", - done=touch("trimming/{sample}-{unit}.se.done"), + done=touch("trimming/{sample}-{unit}.fastq.gz.done"), params: extra="--gzip", params=config["params"]["adapterremoval"]["se"], @@ -45,7 +45,8 @@ rule trim_reads_pe: else temp("trimming/{sample}-{unit}.pair2.fastq.gz") ), settings="trimming/{sample}-{unit}.pe.settings", - done=touch("trimming/{sample}-{unit}.pe.done"), + done1=touch("trimming/{sample}-{unit}.pair1.fastq.gz.done"), + done2=touch("trimming/{sample}-{unit}.pair2.fastq.gz.done"), params: extra="--gzip", params=config["params"]["adapterremoval"]["pe"], @@ -101,7 +102,7 @@ rule trim_reads_pe_merged: else temp("trimming/{sample}-{unit}.discarded.gz") ), settings="trimming/{sample}-{unit}.pe-merged.settings", - done=touch("trimming/{sample}-{unit}.pe-merged.done"), + done=touch("trimming/{sample}-{unit}.collapsed.gz.done"), params: extra="--gzip --collapse", params=config["params"]["adapterremoval"]["pe"], @@ -147,6 +148,11 @@ def get_trimmed_reads(wildcards): ) +def get_trimmed_reads_done(wildcards): + files = get_trimmed_reads(wildcards) + return [f + ".done" for f in files] + + def get_trimming_report(sample, unit): """Get the report needed for MultiQC.""" if is_single_end(sample, unit): diff --git a/workflow/rules/trimming-cutadapt.smk b/workflow/rules/trimming-cutadapt.smk index e973cb3..96d36fb 100644 --- a/workflow/rules/trimming-cutadapt.smk +++ b/workflow/rules/trimming-cutadapt.smk @@ -13,7 +13,7 @@ rule trim_reads_se: else temp("trimming/{sample}-{unit}.fastq.gz") ), qc="trimming/{sample}-{unit}.qc-se.txt", - done=touch("trimming/{sample}-{unit}.se.done"), + done=touch("trimming/{sample}-{unit}.fastq.gz.done"), params: adapters=config["params"]["cutadapt"]["se"]["adapters"], extra=config["params"]["cutadapt"]["se"]["extra"], @@ -44,7 +44,8 @@ rule trim_reads_pe: else temp("trimming/{sample}-{unit}.2.fastq.gz") ), qc="trimming/{sample}-{unit}.qc-pe.txt", - done=touch("trimming/{sample}-{unit}.pe.done"), + done1=touch("trimming/{sample}-{unit}.1.fastq.gz.done"), + done2=touch("trimming/{sample}-{unit}.2.fastq.gz.done"), params: adapters=config["params"]["cutadapt"]["pe"]["adapters"], 
extra=config["params"]["cutadapt"]["pe"]["extra"], @@ -81,12 +82,17 @@ def get_trimmed_reads(wildcards): # paired-end sample return expand( "trimming/{sample}-{unit}.{pair}.fastq.gz", - pair=[1, 2], sample=wildcards.sample, unit=wildcards.unit, + pair=[1, 2], ) +def get_trimmed_reads_done(wildcards): + files = get_trimmed_reads(wildcards) + return [f + ".done" for f in files] + + def get_trimming_report(sample, unit): """Get the report needed for MultiQC.""" if is_single_end(sample, unit): diff --git a/workflow/rules/trimming-fastp.smk b/workflow/rules/trimming-fastp.smk index 49c9b73..6aa5a15 100644 --- a/workflow/rules/trimming-fastp.smk +++ b/workflow/rules/trimming-fastp.smk @@ -21,7 +21,7 @@ rule trim_reads_se: ), html="trimming/{sample}-{unit}-se-fastp.html", json="trimming/{sample}-{unit}-se-fastp.json", - done=touch("trimming/{sample}-{unit}-se.done"), + done=touch("trimming/{sample}-{unit}.fastq.gz.done"), log: "logs/trimming/fastp/{sample}-{unit}.log", benchmark: @@ -46,7 +46,8 @@ rule trim_reads_pe: ), html="trimming/{sample}-{unit}-pe-fastp.html", json="trimming/{sample}-{unit}-pe-fastp.json", - done=touch("trimming/{sample}-{unit}-pe.done"), + done1=touch("trimming/{sample}-{unit}.1.fastq.gz.done"), + done2=touch("trimming/{sample}-{unit}.2.fastq.gz.done"), log: "logs/trimming/fastp/{sample}-{unit}.log", benchmark: @@ -71,7 +72,7 @@ rule trim_reads_pe_merged: ), html="trimming/{sample}-{unit}-pe-merged-fastp.html", json="trimming/{sample}-{unit}-pe-merged-fastp.json", - done=touch("trimming/{sample}-{unit}-pe-merged.done"), + done=touch("trimming/{sample}-{unit}-merged.fastq.gz.done"), log: "logs/trimming/fastp/{sample}-{unit}.log", benchmark: @@ -111,12 +112,17 @@ def get_trimmed_reads(wildcards): # paired-end sample return expand( "trimming/{sample}-{unit}.{pair}.fastq.gz", - pair=[1, 2], sample=wildcards.sample, unit=wildcards.unit, + pair=[1, 2], ) +def get_trimmed_reads_done(wildcards): + files = get_trimmed_reads(wildcards) + return [f + ".done" for f in files] + + def get_trimming_report(sample, unit): """Get the report needed for MultiQC.""" if is_single_end(sample, unit): diff --git a/workflow/rules/trimming-none.smk b/workflow/rules/trimming-none.smk index dc0be5c..2b6c732 100644 --- a/workflow/rules/trimming-none.smk +++ b/workflow/rules/trimming-none.smk @@ -8,6 +8,26 @@ def get_trimmed_reads(wildcards): return list(get_fastq(wildcards).values()) +# In our current setup, we require "done" files to trick snakemake into working properly. +# These might not exist when running with raw fastq files, so we touch them here. +def get_trimmed_reads_done(wildcards): + files = get_trimmed_reads(wildcards) + + # Check if we have touched the fastq done files already + if not hasattr(get_trimmed_reads_done, "done"): + get_trimmed_reads_done.done = False + + # If not, touch all files, then set the internal flag + # so that we do not do this every time this function is called. + if not get_trimmed_reads_done.done: + for f in files: + Path(f + ".done").touch() + get_trimmed_reads_done.done = True + + # Now we can return the fastq done file list to the caller. + return [f + ".done" for f in files] + + def get_trimming_report(sample, unit): # No trimming report here. 
     return []
diff --git a/workflow/rules/trimming-seqprep.smk b/workflow/rules/trimming-seqprep.smk
index 000a002..bd7235b 100644
--- a/workflow/rules/trimming-seqprep.smk
+++ b/workflow/rules/trimming-seqprep.smk
@@ -39,7 +39,9 @@ rule trim_reads_pe:
             if config["settings"]["keep-intermediate"]["trimming"]
             else temp("trimming/{sample}-{unit}.2.discarded.fastq.gz")
         ),
-        done=touch("trimming/{sample}-{unit}.done"),
+        donem=touch("trimming/{sample}-{unit}-merged.fastq.gz.done"),
+        done1=touch("trimming/{sample}-{unit}.1.fastq.gz.done"),
+        done2=touch("trimming/{sample}-{unit}.2.fastq.gz.done"),
     params:
         extra=config["params"]["seqprep"]["extra"],
     log:
@@ -78,12 +80,17 @@ def get_trimmed_reads(wildcards):
     # paired-end sample
     return expand(
         "trimming/{sample}-{unit}.{pair}.fastq.gz",
-        pair=[1, 2],
         sample=wildcards.sample,
         unit=wildcards.unit,
+        pair=[1, 2],
     )
 
 
+def get_trimmed_reads_done(wildcards):
+    files = get_trimmed_reads(wildcards)
+    return [f + ".done" for f in files]
+
+
 # MultiQC does not support SeqPrep. Empty output.
 def get_trimming_report(sample, unit):
     """Get the report needed for MultiQC."""
diff --git a/workflow/rules/trimming-skewer.smk b/workflow/rules/trimming-skewer.smk
index 8802f75..32ed14c 100644
--- a/workflow/rules/trimming-skewer.smk
+++ b/workflow/rules/trimming-skewer.smk
@@ -14,7 +14,7 @@ rule trim_reads_se:
            else temp("trimming/{sample}-{unit}-se-trimmed.fastq.gz")
         ),
         "trimming/{sample}-{unit}-se-trimmed.log",
-        touch("trimming/{sample}-{unit}-se.done"),
+        touch("trimming/{sample}-{unit}-se-trimmed.fastq.gz.done"),
     params:
         extra="--format sanger --compress",
         params=config["params"]["skewer"]["se"],
@@ -47,7 +47,8 @@ rule trim_reads_pe:
            else temp("trimming/{sample}-{unit}-pe-trimmed-pair2.fastq.gz")
         ),
         log="trimming/{sample}-{unit}-pe-trimmed.log",
-        done=touch("trimming/{sample}-{unit}-pe.done"),
+        done1=touch("trimming/{sample}-{unit}-pe-trimmed-pair1.fastq.gz.done"),
+        done2=touch("trimming/{sample}-{unit}-pe-trimmed-pair2.fastq.gz.done"),
     params:
         extra="--format sanger --compress",
         params=config["params"]["skewer"]["pe"],
@@ -93,6 +94,12 @@ def get_trimmed_reads(wildcards):
     )
 
 
+def get_trimmed_reads_done(wildcards):
+    files = get_trimmed_reads(wildcards)
+    return [f + ".done" for f in files]
+
+
+
 def get_trimming_report(sample, unit):
     """Get the report needed for MultiQC."""
     if is_single_end(sample, unit):
diff --git a/workflow/rules/trimming-trimmomatic.smk b/workflow/rules/trimming-trimmomatic.smk
index 2fffd97..e94a6b3 100644
--- a/workflow/rules/trimming-trimmomatic.smk
+++ b/workflow/rules/trimming-trimmomatic.smk
@@ -13,7 +13,7 @@ rule trim_reads_se:
            else temp("trimming/{sample}-{unit}.fastq.gz")
         ),
         # trimlog="trimming/{sample}-{unit}-se.trimlog.log"
-        touch("trimming/{sample}-{unit}-se.done"),
+        touch("trimming/{sample}-{unit}.fastq.gz.done"),
     params:
         # extra=lambda w, output: "-trimlog {}".format(output.trimlog),
         extra=config["params"]["trimmomatic"]["se"]["extra"],
@@ -55,7 +55,8 @@ rule trim_reads_pe:
            else temp("trimming/{sample}-{unit}.2.unpaired.fastq.gz")
         ),
         # trimlog="trimming/{sample}-{unit}-pe.trimlog.log"
-        done=touch("trimming/{sample}-{unit}-pe.done"),
+        done1=touch("trimming/{sample}-{unit}.1.fastq.gz.done"),
+        done2=touch("trimming/{sample}-{unit}.2.fastq.gz.done"),
     params:
         # extra=lambda w, output: "-trimlog {}".format(output.trimlog),
         extra=config["params"]["trimmomatic"]["se"]["extra"],
@@ -99,6 +100,11 @@ def get_trimmed_reads(wildcards):
     )
 
 
+def get_trimmed_reads_done(wildcards):
+    files = get_trimmed_reads(wildcards)
+    return [f + ".done" for f in files]
+
+
 # MultiQC expects the normal stdout log files from trimmomatic, but as we use a wrapper for the latter,
 # we cannot also declare the log files as output files, because snakemake...
 # Instead, we copy them afterwards. This is dirty, but that's how the snake rolls...
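
Editor's closing note on the remaining idiom in this patch: for files that come from outside the workflow (raw fastq in trimming-none.smk, external bams in mapping.smk), no rule ever produces the sentinels, so they are touched once on first use, memoized via a function attribute. A condensed sketch of that idiom, with a hypothetical helper name:

    from pathlib import Path

    def ensure_done_files(files):
        # A function attribute acts as a module-level "have we done this" flag.
        if not hasattr(ensure_done_files, "done"):
            ensure_done_files.done = False
        if not ensure_done_files.done:
            for f in files:
                # Create the sentinel next to each externally provided file.
                Path(f + ".done").touch()
            ensure_done_files.done = True
        return [f + ".done" for f in files]
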