Request done files for all rules
lczech committed Feb 10, 2025
1 parent 3a61807 commit c772696
Showing 34 changed files with 323 additions and 86 deletions.
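
The whole commit applies one pattern: each rule additionally touch()-es a `.done` sentinel named after its main output, and downstream rules request that sentinel as an input, so that a half-written upstream file (which can happen when the scheduler, e.g. slurm, interferes) is never silently consumed. A minimal sketch of the pattern; the rule names, file names, and tools below are illustrative, not taken from the workflow:

# Hypothetical producer/consumer pair illustrating the .done sentinel pattern.
rule produce:
    output:
        vcf="calling/example.vcf.gz",
        # touch() creates the sentinel only after the job has finished successfully.
        done=touch("calling/example.vcf.gz.done"),
    shell:
        "some_tool > {output.vcf}"

rule consume:
    input:
        vcf="calling/example.vcf.gz",
        # The sentinel is a pure dependency; it is never passed to the tool itself.
        done="calling/example.vcf.gz.done",
    output:
        "calling/example.stats.txt",
    shell:
        "other_tool {input.vcf} > {output}"
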
4 changes: 2 additions & 2 deletions config/config.yaml
@@ -640,7 +640,7 @@ params:

# Extra parameters for MarkDuplicates.
# See https://gatk.broadinstitute.org/hc/en-us/articles/360057439771-MarkDuplicates-Picard
MarkDuplicates: "REMOVE_DUPLICATES=true"
MarkDuplicates: "--REMOVE_DUPLICATES true"

# Run several Picard QC tools, as needed, using Picard CollectMultipleMetrics.
# See https://gatk.broadinstitute.org/hc/en-us/articles/360042478112-CollectMultipleMetrics-Picard
@@ -670,7 +670,7 @@ params:
# or `-Djava.io.tmpdir=/path/to/tmp` to set a directory for temporary data, in case that the
# system-provided tmp dir is too small (which can happen on clusters).
# The last option, SortVcf-java-opts, is used by bcftools when using contig-group-size > 0.
MarkDuplicates-java-opts: "-Xmx10g"
MarkDuplicates-java-opts: ""
CollectMultipleMetrics-java-opts: ""
SortVcf-java-opts: ""
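
For context, these java-opts strings are prepended to the JVM invocation of the respective Picard call, and the MarkDuplicates value above now uses the newer --OPTION value syntax rather than the legacy OPTION=value form. A sketch of how such options typically end up in a rule; the rule name, config keys, and picard.jar path are assumptions for illustration, not the workflow's actual code:

# Hypothetical rule showing where the config strings above would be interpolated.
rule mark_duplicates_sketch:
    input:
        bam="mapping/sorted/sample-1.bam",
    output:
        bam="mapping/dedup/sample-1.bam",
        metrics="mapping/dedup/sample-1.metrics.txt",
    params:
        # Both strings come straight from the config; an empty string contributes nothing.
        extra=config["params"]["picard"]["MarkDuplicates"],
        java_opts=config["params"]["picard"]["MarkDuplicates-java-opts"],
    shell:
        "java {params.java_opts} -jar picard.jar MarkDuplicates "
        "--INPUT {input.bam} --OUTPUT {output.bam} "
        "--METRICS_FILE {output.metrics} {params.extra}"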

4 changes: 4 additions & 0 deletions workflow/Snakefile
@@ -21,9 +21,13 @@ rule all:
# Basic steps
"mapping/final.done",
"calling/genotyped-all.vcf.gz",
"calling/genotyped-all.vcf.gz.done",
"calling/filtered-all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else [],
"calling/filtered-all.vcf.gz.done" if not config["settings"]["filter-variants"] == "none" else [],
"annotation/snpeff.vcf.gz" if config["settings"]["snpeff"] else [],
"annotation/snpeff.vcf.gz.done" if config["settings"]["snpeff"] else [],
"annotation/vep.vcf.gz" if config["settings"]["vep"] else [],
"annotation/vep.vcf.gz.done" if config["settings"]["vep"] else [],
# Quality control
"qc/multiqc.html",
# Reference genome statistics
2 changes: 1 addition & 1 deletion workflow/envs/picard.yaml
@@ -16,7 +16,7 @@ dependencies:
- numpy ==2.0.0

# Tools
- picard ==3.2.0
- picard ==3.3.0

# Need tidyverse because of issue #9
# We are now using the version that gets installed when specifying no version,
12 changes: 12 additions & 0 deletions workflow/rules/annotation.smk
@@ -68,6 +68,11 @@ rule snpeff:
if not config["settings"]["filter-variants"] == "none"
else "calling/genotyped-all.vcf.gz"
),
done=(
"calling/filtered-all.vcf.gz.done"
if not config["settings"]["filter-variants"] == "none"
else "calling/genotyped-all.vcf.gz.done"
),
# path to reference db downloaded with the snpeff download wrapper above
db=get_snpeff_db_path(),
output:
@@ -77,6 +82,7 @@
stats=report("annotation/snpeff.html", category="Calls"),
# summary statistics in CSV, optional
csvstats="annotation/snpeff.csv",
done=touch("annotation/snpeff.vcf.gz.done")
log:
"logs/annotation/snpeff.log",
group:
@@ -191,6 +197,11 @@ rule vep:
if not config["settings"]["filter-variants"] == "none"
else "calling/genotyped-all.vcf.gz"
),
done=(
"calling/filtered-all.vcf.gz.done"
if not config["settings"]["filter-variants"] == "none"
else "calling/genotyped-all.vcf.gz.done"
),
cache=get_vep_cache_dir(),
plugins=get_vep_plugins_dir(),
output:
@@ -209,6 +220,7 @@
caption="../report/stats.rst",
category="Calls",
),
done=touch("annotation/vep.vcf.gz.done")
params:
# Pass a list of plugins to use,
# see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
21 changes: 16 additions & 5 deletions workflow/rules/calling-bcftools-combined.smk
@@ -15,6 +15,7 @@ rule call_variants:
# similar to what our GATK HaplotypeCaller rule does.
samples=get_all_bams(),
indices=get_all_bais(),
done=get_all_bams_done(),
# If we use restricted regions, set them here. If not, empty, which will propagate to the
# get_mpileup_params function as well. Same for small contig groups.
# regions="calling/regions/{contig}.bed" if config["settings"].get("restrict-regions") else []
@@ -34,7 +35,7 @@
else temp("calling/called/{contig}.vcf.gz")
),
# vcf=protected("calling/called/{contig}.vcf.gz")
done=touch("calling/called/{contig}.done"),
done=touch("calling/called/{contig}.vcf.gz.done"),
params:
# Optional parameters for bcftools mpileup (except -g, -f).
mpileup=get_mpileup_params,
@@ -86,8 +87,17 @@ def merge_variants_vcfs_input(wildcards):
# Need index files for some of the downstream tools.
# Rationale for the fai: see merge_variants_vcfs_input()
def merge_variants_tbis_input(wildcards):
fai = checkpoints.samtools_faidx.get().output[0]
return expand("calling/called/{contig}.vcf.gz.tbi", contig=get_contigs(fai))
files = merge_variants_vcfs_input(wildcards)
return [f + ".tbi" for f in files]
# fai = checkpoints.samtools_faidx.get().output[0]
# return expand("calling/called/{contig}.vcf.gz.tbi", contig=get_contigs(fai))


# Lastly, we want to make sure that all inputs are actually done,
# as snakemake can still mess this up with slurm etc... :-(
def merge_variants_done_input(wildcards):
files = merge_variants_vcfs_input(wildcards)
return [f + ".done" for f in files]


# bcftools does not automatically create vcf index files, so we need a rule for that...
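
The indexing rule referred to here is collapsed in this view; such a rule is typically a thin wrapper around tabix, roughly like the following sketch (an assumption about its shape, not necessarily the workflow's exact rule):

# Hypothetical sketch of a VCF indexing rule.
rule vcf_index_sketch:
    input:
        "calling/called/{contig}.vcf.gz",
    output:
        "calling/called/{contig}.vcf.gz.tbi",
    log:
        "logs/calling/tabix/{contig}.log",
    shell:
        # tabix writes the .tbi index right next to the input file.
        "tabix -p vcf {input} > {log} 2>&1"
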
@@ -159,6 +169,7 @@ rule merge_variants:
contig_groups=contigs_groups_input,
vcfs=merge_variants_vcfs_input,
tbis=merge_variants_tbis_input,
done=merge_variants_done_input,
output:
# With small contigs, we also need sorting, see below.
# Unfortunately, we cannot pipe here, as Picard fails with that, so temp file it is...
@@ -169,9 +180,9 @@
else "calling/genotyped-all.vcf.gz"
),
done=(
touch("calling/merged/merged-all.done")
touch("calling/merged/merged-all.vcf.gz.done")
if (config["settings"].get("contig-group-size"))
else touch("calling/genotyped-all.done")
else touch("calling/genotyped-all.vcf.gz.done")
),
log:
"logs/calling/vcflib/merge-genotyped.log",
22 changes: 16 additions & 6 deletions workflow/rules/calling-bcftools-individual.smk
@@ -14,6 +14,7 @@ rule call_variants:
# Get the sample data.
samples=get_sample_bams_wildcards,
indices=get_sample_bais_wildcards,
done=get_sample_bams_wildcards_done,
# If we use restricted regions, set them here. If not, empty, which will propagate to the
# get_mpileup_params function as well. Same for small contig groups.
# regions="calling/regions/{contig}.bed" if config["settings"].get("restrict-regions") else []
@@ -33,7 +34,7 @@
else temp("calling/called/{sample}.{contig}.g.vcf.gz")
),
gtbi="calling/called/{sample}.{contig}.g.vcf.gz.tbi",
done=touch("calling/called/{sample}.{contig}.g.done"),
done=touch("calling/called/{sample}.{contig}.g.vcf.gz.done"),
params:
# Optional parameters for bcftools mpileup (except -g, -f).
mpileup=get_mpileup_params,
@@ -54,8 +55,6 @@
"("
"bcftools mpileup "
" {params.mpileup} --fasta-ref {input.ref} --output-type u {input.samples} "


" -a 'INFO/AD' -a 'FORMAT/AD' -a 'FORMAT/DP' --gvcf 0 | "
"bcftools call "
" --threads {threads} {params.call} --gvcf 0 --output-type u | "
@@ -90,11 +89,15 @@ rule combine_contig:
"calling/called/{sample}.{{contig}}.g.vcf.gz.tbi",
sample=config["global"]["sample-names"],
),
done=expand(
"calling/called/{sample}.{{contig}}.g.vcf.gz.done",
sample=config["global"]["sample-names"],
),
output:
gvcf="calling/combined/all.{contig}.g.vcf.gz",
gtbi="calling/combined/all.{contig}.g.vcf.gz.tbi",
gvcflist="calling/combined/all.{contig}.g.txt",
done=touch("calling/combined/all.{contig}.g.done"),
done=touch("calling/combined/all.{contig}.g.vcf.gz.done"),
log:
"logs/calling/bcftools/combine-contig-{contig}.log",
benchmark:
@@ -131,6 +134,12 @@ def combined_contig_gvcfs(wildcards):
return expand("calling/combined/all.{contig}.g.vcf.gz", contig=get_contigs(fai))


# Also need the done files to make sure snakemake doesn't mess this up.
def combined_contig_done(wildcards):
fai = checkpoints.samtools_faidx.get().output[0]
return expand("calling/called/all.{contig}.g.vcf.gz.done", contig=get_contigs(fai))


# We also need a comma-separated list of the contigs, so that bcftools can output
# the concatenated entries in the correct order as given in the fai.
# For this, we use the same technique of using the fai checkpoint as before.
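
The helper built from that technique is collapsed here; it presumably reuses the fai checkpoint and get_contigs() seen above, roughly as in this sketch (a guess at its shape, not the exact code):

# Hypothetical sketch of a comma-separated contig list helper.
def contig_order_sketch(wildcards):
    fai = checkpoints.samtools_faidx.get().output[0]
    # Contig names joined in the order given in the fai.
    return ",".join(get_contigs(fai))
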
@@ -154,6 +163,7 @@
ref=get_fai,
contig_groups=contigs_groups_input,
gvcfs=combined_contig_gvcfs,
dones=combined_contig_done,
output:
# vcf="calling/genotyped-all.vcf.gz",
# tbi="calling/genotyped/all.vcf.gz.tbi",
@@ -175,9 +185,9 @@
else "calling/genotyped-all.txt"
),
done=(
touch("calling/merged/merged-all.done")
touch("calling/merged/merged-all.vcf.gz.done")
if (config["settings"].get("contig-group-size"))
else touch("calling/genotyped-all.done")
else touch("calling/genotyped-all.vcf.gz.done")
),
params:
# Use a list of the chromosomes in the same order as the fai, for bcftools to sort the output.
3 changes: 2 additions & 1 deletion workflow/rules/calling-bcftools.smk
@@ -64,10 +64,11 @@ if config["settings"].get("contig-group-size"):
rule sort_variants:
input:
vcf="calling/merged/merged-all.vcf.gz",
done="calling/merged/merged-all.vcf.gz.done",
refdict=genome_dict(),
output:
vcf="calling/genotyped-all.vcf.gz",
done=touch("calling/genotyped-all.done"),
done=touch("calling/genotyped-all.vcf.gz.done"),
params:
# See duplicates-picard.smk for the reason we need this on MacOS.
extra=(
12 changes: 10 additions & 2 deletions workflow/rules/calling-freebayes.smk
@@ -38,6 +38,7 @@ rule call_variants:
# Without bai files, freebayes claims that it recomputes them, but actually crashes...
samples=get_all_bams(),
indices=get_all_bais(),
dones=get_all_bams_done(),
# If known variants are set, use this as input to ensure the file exists.
# We use the file via the know_variants_extra() function call below,
# but request it here as well to ensure that it and its index are present.
@@ -58,7 +59,7 @@
),
output:
pipe("calling/called/{contig}.vcf"),
touch("calling/called/{contig}.vcf.done"),
# touch("calling/called/{contig}.vcf.done"),
log:
"logs/calling/freebayes/{contig}.log",
benchmark:
@@ -117,6 +118,12 @@ def merge_variants_vcfs_input(wildcards):
return expand("calling/called/{contig}.vcf.gz", contig=get_contigs(fai))


# Need to work around snakemake issues by requiring the done files
def merge_variants_vcfs_input_done(wildcards):
fai = checkpoints.samtools_faidx.get().output[0]
return expand("calling/called/{contig}.vcf.gz.done", contig=get_contigs(fai))


rule merge_variants:
input:
# fai is needed to calculate aggregation over contigs below.
@@ -128,9 +135,10 @@
# The wrapper expects input to be called `vcfs`, but we can use `vcf.gz` as well.
# vcfs=lambda w: expand("calling/called/{contig}.vcf.gz", contig=get_contigs())
vcfs=merge_variants_vcfs_input,
done=merge_variants_vcfs_input_done,
output:
vcf="calling/genotyped-all.vcf.gz",
done=touch("calling/genotyped-all.done"),
done=touch("calling/genotyped-all.vcf.gz.done"),
params:
# See duplicates-picard.smk for the reason we need this on MacOS.
extra=(
21 changes: 17 additions & 4 deletions workflow/rules/calling-haplotypecaller.smk
@@ -25,6 +25,7 @@ rule call_variants:
# Get the sample data.
bam=get_sample_bams_wildcards,
bai=get_sample_bais_wildcards,
done=get_sample_bams_wildcards_done,
# Get the reference genome, as well as its indices.
ref=config["data"]["reference-genome"],
refidcs=expand(
@@ -60,7 +61,7 @@
if config["settings"]["keep-intermediate"]["calling"]
else temp("calling/called/{sample}.{contig}.g.vcf.gz.tbi")
),
done=touch("calling/called/{sample}.{contig}.g.done"),
done=touch("calling/called/{sample}.{contig}.g.vcf.gz.done"),
log:
"logs/calling/gatk-haplotypecaller/{sample}.{contig}.log",
benchmark:
@@ -135,13 +136,17 @@ rule combine_calls:
"calling/called/{sample}.{{contig}}.g.vcf.gz.tbi",
sample=config["global"]["sample-names"],
),
done=expand(
"calling/called/{sample}.{{contig}}.g.vcf.gz.done",
sample=config["global"]["sample-names"],
),
output:
gvcf=(
"calling/combined/all.{contig}.g.vcf.gz"
if config["settings"]["keep-intermediate"]["calling"]
else temp("calling/combined/all.{contig}.g.vcf.gz")
),
done=touch("calling/combined/all.{contig}.g.done"),
done=touch("calling/combined/all.{contig}.g.vcf.gz.done"),
params:
extra=config["params"]["gatk"]["CombineGVCFs-extra"]
+ (
@@ -173,13 +178,14 @@ rule genotype_variants:
),
refdict=genome_dict(),
gvcf="calling/combined/all.{contig}.g.vcf.gz",
done="calling/combined/all.{contig}.g.vcf.gz.done",
output:
vcf=(
"calling/genotyped/all.{contig}.vcf.gz"
if config["settings"]["keep-intermediate"]["calling"]
else temp("calling/genotyped/all.{contig}.vcf.gz")
),
done=touch("calling/genotyped/all.{contig}.done"),
done=touch("calling/genotyped/all.{contig}.vcf.gz.done"),
params:
extra=config["params"]["gatk"]["GenotypeGVCFs-extra"]
+ (
@@ -211,6 +217,12 @@ def merge_variants_vcfs_input(wildcards):
return expand("calling/genotyped/all.{contig}.vcf.gz", contig=get_contigs(fai))


# Also need to trick snakemake into completing all files...
def merge_variants_vcfs_input_done(wildcards):
fai = checkpoints.samtools_faidx.get().output[0]
return expand("calling/genotyped/all.{contig}.vcf.gz.done", contig=get_contigs(fai))


rule merge_variants:
input:
# fai is needed to calculate aggregation over contigs below.
@@ -221,9 +233,10 @@
contig_groups=contigs_groups_input,
# vcfs=lambda w: expand("calling/genotyped/all.{contig}.vcf.gz", contig=get_contigs())
vcfs=merge_variants_vcfs_input,
done=merge_variants_vcfs_input_done,
output:
vcf="calling/genotyped-all.vcf.gz",
done=touch("calling/genotyped-all.done"),
done=touch("calling/genotyped-all.vcf.gz.done"),
params:
# See duplicates-picard.smk for the reason whe need this on MacOS.
extra=(
2 changes: 2 additions & 0 deletions workflow/rules/damage.smk
@@ -10,6 +10,7 @@ rule mapdamage:
# Get either the normal mapping output, or, if additional filtering via `samtools view`
# is set in the config settings: filter-mapped-reads, use the filtered output instead.
get_mapped_reads,
get_mapped_reads_done,
# "mapping/sorted/{sample}-{unit}.bam"
# Get the reference genome and its indices. Not sure if the indices are needed
# for this particular rule, but doesn't hurt to include them as an input anyway.
@@ -60,6 +61,7 @@ rule damageprofiler:
# Get either the normal mapping output, or, if additional filtering via `samtools view`
# is set in the config settings: filter-mapped-reads, use the filtered output instead.
get_mapped_reads,
get_mapped_reads_done,
# "mapping/sorted/{sample}-{unit}.bam"
# Get the reference genome and its indices. Not sure if the indices are needed
# for this particular rule, but doesn't hurt to include them as an input anyway.
6 changes: 4 additions & 2 deletions workflow/rules/duplicates-dedup.smk
@@ -28,6 +28,7 @@ rule mark_duplicates:
# or the clipped ones, if those were requested.
# We always use the merged units per sample here.
get_mapped_reads,
get_mapped_reads_done,
output:
bam=temp("mapping/dedup/{sample}_rmdup.bam"),
metrics="mapping/dedup/{sample}.dedup.json",
@@ -46,7 +47,7 @@
shell:
# Dedup bases its output names on this, so we need some more trickery to make it
# output the file names that we want.
"dedup -i {input} -o {params.out_dir} {params.extra} > {log} 2>&1 ; "
"dedup -i {input[0]} -o {params.out_dir} {params.extra} > {log} 2>&1 ; "
# "mv"
# " dedup/{wildcards.sample}." + get_mapped_read_infix() + "_rmdup.bam"
# " dedup/{wildcards.sample}_rmdup.bam ; "
@@ -58,13 +59,14 @@
rule sort_reads_dedup:
input:
"mapping/dedup/{sample}_rmdup.bam",
"mapping/dedup/{sample}.dedup.done",
output:
(
"mapping/dedup/{sample}.bam"
if config["settings"]["keep-intermediate"]["mapping"]
else temp("mapping/dedup/{sample}.bam")
),
touch("mapping/dedup/{sample}.done"),
touch("mapping/dedup/{sample}.bam.done"),
params:
extra=config["params"]["samtools"]["sort"],
tmp_dir=config["params"]["samtools"]["temp-dir"],