Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def check_config(value, default=False, place=config):
return place[value] if (value in place and place[value]) else default

no_variant_calling = check_config('vcf_file')
unpaired = check_config('unpaired')

def read_samples():
"""Function to get names and dna/rna fastq paths from a sample file
Expand All @@ -23,17 +24,24 @@ def read_samples():
<vcf_sample_id> <unique_sample_name> <dna_bam_path> <fastq1_rna_path> <fastq2_rna_path>
or (optionally) just these four:
<vcf_sample_id> <unique_sample_name> <fastq1_rna_path> <fastq2_rna_path>
Or if the unpaired config option is also provided, the input file
is expected to have these four columns:
<vcf_sample_id> <unique_sample_name> <dna_bam_path> <fastq_rna_path>
or (optionally) just these three:
<vcf_sample_id> <unique_sample_name> <fastq_rna_path>
Modify this function as needed to provide the correct dictionary."""
f = open(config['sample_file'], "r")
samp_dict = {}
for line in f:
words = line.strip().split("\t")
if no_variant_calling:
if len(words) == 5:
samp_dict[words[1]] = (words[2], (words[3], words[4]))
elif len(words) == 4:
if len(words) == (5-unpaired):
fastq = words[3] if unpaired else (words[3], words[4])
samp_dict[words[1]] = (words[2], fastq)
elif len(words) == (4-unpaired):
# then the dna_bam_path hasn't been specified
samp_dict[words[1]] = ((words[2], words[3]),)
fastq = words[2] if unpaired else (words[2], words[3])
samp_dict[words[1]] = (fastq,)
# sanity check to make sure that rna_only is set to true
config['rna_only'] = True
else:
Expand Down
80 changes: 59 additions & 21 deletions Snakefiles/Snakefile-WASP
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,35 @@ min_version("5.18.0")

include: "snp2h5_rules.smk"

rule uncompress_ref:
"""temporarily uncompress gzipped files for the STAR index"""
input:
ref = lambda wildcards: config['ref_genome']
output:
ref = temp(config['output_dir'] + "/ref_genome/"+Path(config['ref_genome']).stem)
conda: "../envs/default.yaml"
benchmark: config['output_dir'] + "/benchmark/WASP/uncompress_ref/all.tsv"
shell:
"gunzip -c {input.ref} > {output.ref}"

rule uncompress_genes:
"""temporarily uncompress gzipped files for the STAR index"""
input:
genes = config['gene_info']
output:
genes = temp(config['output_dir'] + "/ref_genome/"+Path(config['gene_info']).stem)
conda: "../envs/default.yaml"
benchmark: config['output_dir'] + "/benchmark/WASP/uncompress_genes/all.tsv"
shell:
"gunzip -c {input.genes} > {output.genes}"

rule create_STAR_index:
"""create an index of the reference genome using STAR"""
input:
ref = lambda wildcards: config['ref_genome'],
genes = config['gene_info']
ref = lambda wildcards: rules.uncompress_ref.output.ref if \
config['ref_genome'].endswith('.gz') else config['ref_genome'],
genes = rules.uncompress_genes.output.genes if \
config['gene_info'].endswith('.gz') else config['gene_info']
params:
overhang = config['sjdbOverhang'] if 'sjdbOverhang' in config and \
config['sjdbOverhang'] else 100
Expand All @@ -27,11 +51,23 @@ rule create_STAR_index:
"--sjdbOverhang {params.overhang} --runMode genomeGenerate "
"--genomeFastaFiles {input.ref} --sjdbGTFfile {input.genes}"

rule create_BWA_index:
"""create an index of the reference genome using BWA"""
input:
ref = lambda wildcards: config['ref_genome']
params:
ref = lambda wildcards, output: output[0] + "/" + Path(config['ref_genome']).name
output: directory(config['output_dir'] + "/ref_genome/bwa_index")
conda: "../envs/default.yaml"
benchmark: config['output_dir'] + "/benchmark/WASP/create_BWA_index/all.tsv"
shell:
"ln -sf {input.ref} {params.ref} && "
"bwa index {params.ref}"

rule map_STAR1:
"""map reads using STAR"""
input:
fastq1 = lambda wildcards: SAMP2[wildcards.sample][0],
fastq2 = lambda wildcards: SAMP2[wildcards.sample][1],
fastq = lambda wildcards: SAMP2[wildcards.sample],
index = config['ref_genome_star'] if 'ref_genome_star' in config and \
config['ref_genome_star'] else rules.create_STAR_index.output
params:
Expand All @@ -45,7 +81,7 @@ rule map_STAR1:
shell:
"STAR --runThreadN 1 "
"--genomeDir {input.index} "
"--readFilesIn {input.fastq1} {input.fastq2} "
"--readFilesIn {input.fastq} "
"--readFilesCommand zcat "
"--outSAMtype BAM Unsorted "
"--alignEndsType EndToEnd "
Expand All @@ -54,17 +90,16 @@ rule map_STAR1:
rule map_BWA1:
"""map ATAC-seq reads using BWA-MEM"""
input:
fastq1 = lambda wildcards: SAMP2[wildcards.sample][0],
fastq2 = lambda wildcards: SAMP2[wildcards.sample][1],
ref = config['ref_genome']
fastq = lambda wildcards: SAMP2[wildcards.sample],
ref = lambda wildcards: config['ref_genome']
output:
config['output_dir'] + "/map1/{sample}/aln.bam"
conda: "../envs/default.yaml"
benchmark: config['output_dir'] + "/benchmark/WASP/map_BWA1/{sample}.tsv"
resources:
mem_mb = 30000
shell:
"bwa mem {input.ref} {input.fastq1} {input.fastq2} > {output}"
"bwa mem {input.ref} {input.fastq} > {output}"

rule sort_and_index_bam1:
"""sort and index bam generated by first mapping step"""
Expand Down Expand Up @@ -93,10 +128,14 @@ rule find_intersecting_snps:
find_intersecting_snps_script = rules.get_WASP.output.find_intersecting_snps_script
params:
sample_names = lambda wildcards: SAMP_TO_VCF_ID[wildcards.sample],
output_dir = lambda wildcards, output: str(Path(output.fastq1).parent)
output_dir = lambda wildcards, output: str(Path(output.remap_bam).parent),
is_paired = "" if 'unpaired' in config and config['unpaired'] else "--is_paired_end "
output:
fastq1 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz",
fastq2 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz",
fastq = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq.gz" \
if 'unpaired' in config and config['unpaired'] else [
config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz",
config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz"
],
keep_bam = temp(config['output_dir'] + "/find_intersecting_snps/{sample}.keep.bam"),
remap_bam = config['output_dir'] + "/find_intersecting_snps/{sample}.to.remap.bam"
conda: "../envs/default.yaml"
Expand All @@ -105,7 +144,7 @@ rule find_intersecting_snps:
mem_mb = 10000
shell:
"python {input.find_intersecting_snps_script} "
"--is_paired_end "
"{params.is_paired}"
"--is_sorted "
"--output_dir {params.output_dir} "
"--snp_tab {input.snp_tab} "
Expand All @@ -117,8 +156,7 @@ rule find_intersecting_snps:
rule map_STAR2:
"""map reads a second time using STAR"""
input:
fastq1 = rules.find_intersecting_snps.output.fastq1,
fastq2 = rules.find_intersecting_snps.output.fastq2,
fastq = rules.find_intersecting_snps.output.fastq,
reference = config['ref_genome_star'] if 'ref_genome_star' in config and \
config['ref_genome_star'] else rules.create_STAR_index.output
params:
Expand All @@ -132,7 +170,7 @@ rule map_STAR2:
shell:
"STAR --runThreadN 1 "
"--genomeDir {input.reference} "
"--readFilesIn {input.fastq1} {input.fastq2} "
"--readFilesIn {input.fastq} "
"--readFilesCommand zcat "
"--outSAMtype BAM Unsorted "
"--alignEndsType EndToEnd "
Expand All @@ -141,17 +179,16 @@ rule map_STAR2:
rule map_BWA2:
"""map ATAC-seq reads using BWA-MEM"""
input:
fastq1 = rules.find_intersecting_snps.output.fastq1,
fastq2 = rules.find_intersecting_snps.output.fastq2,
ref = config['ref_genome']
fastq = rules.find_intersecting_snps.output.fastq,
ref = lambda wildcards: config['ref_genome']
output:
config['output_dir'] + "/map2/{sample}/aln.bam"
conda: "../envs/default.yaml"
benchmark: config['output_dir'] + "/benchmark/WASP/map_BWA2/{sample}.tsv"
resources:
mem_mb = 30000
shell:
"bwa mem {input.ref} {input.fastq1} {input.fastq2} > {output}"
"bwa mem {input.ref} {input.fastq} > {output}"

rule sort_and_index_bam2:
"""sort and index bam generated by second mapping step"""
Expand Down Expand Up @@ -207,7 +244,8 @@ rule rmdup:
input:
bam = rules.sort_filtered_bam.output.bam,
index = rules.sort_filtered_bam.output.index,
rmdup_script = rules.get_WASP.output.rmdup_script
rmdup_script = rules.get_WASP.output.rmdup_script if 'unpaired' in config and \
config['unpaired'] else rules.get_WASP.output.rmdup_pe_script
output:
rmdup = temp(config['output_dir'] + "/rmdup/{sample}.keep.rmdup.bam"),
sort = config['output_dir'] + "/rmdup/{sample}.keep.rmdup.sort.bam",
Expand Down
3 changes: 3 additions & 0 deletions Snakefiles/Snakefile-counts
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ rule get_as_counts:
bam = lambda wildcards: SAMP3[wildcards.sample][
wildcards.type == "rna" and not config['rna_only']
],
bam_idx = lambda wildcards: SAMP3[wildcards.sample][
wildcards.type == "rna" and not config['rna_only']
]+".bai",
snp_index = rules.vcf2h5.output.snp_index,
snp_tab = rules.vcf2h5.output.snp_tab,
haplotype = rules.vcf2h5.output.haplotype,
Expand Down
3 changes: 2 additions & 1 deletion Snakefiles/snp2h5_rules.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ rule get_WASP:
output:
find_intersecting_snps_script = config['wasp_dir'] + "/mapping/find_intersecting_snps.py",
filter_remapped_reads_script = config['wasp_dir'] + "/mapping/filter_remapped_reads.py",
rmdup_script = config['wasp_dir'] + "/mapping/rmdup_pe.py",
rmdup_script = config['wasp_dir'] + "/mapping/rmdup.py",
rmdup_pe_script = config['wasp_dir'] + "/mapping/rmdup_pe.py",
bam2h5_script = config['wasp_dir'] + "/CHT/bam2h5.py",
makefile = config['wasp_dir'] + "/snp2h5/Makefile"
conda: "../envs/default.yaml"
Expand Down
6 changes: 6 additions & 0 deletions configs/config-WASP.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
# conservative version of the ASE test (see the 'rna_only' config below)
"sample_file" : "data/samples.tsv"

# If your RNA-seq data is single-end instead of paired-end, set this config
# to true. In that case, you can provide only one FASTQ file in each line of
# the samples file above. Otherwise, if this option is set to a falsey value
# or commented out, both FASTQs will be used.
"unpaired": false

# The path to a reference genome
# This is not required if "ref_genome_star" is provided below
# Note: the FASTA file should not be gzipped
Expand Down
3 changes: 2 additions & 1 deletion run
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ if command -v 'slack' &>/dev/null; then
if [ "$exit_code" -eq 0 ]; then
slack "snakemake finished successfully" &>/dev/null
else
slack "$(tail -n4 "$out_path/log")" &>/dev/null
slack "snakemake as_analysis job failed" &>/dev/null
slack "$(tail -n4 "$out_path/log")" &>/dev/null
fi
fi
exit "$exit_code"