aryarm · aryarm · Jan 19, 2021 · Jan 19, 2021 · Jan 20, 2021 · Jan 20, 2021
diff --git a/Snakefile b/Snakefile
@@ -12,6 +12,7 @@ def check_config(value, default=False, place=config):
     return place[value] if (value in place and place[value]) else default
 
 no_variant_calling = check_config('vcf_file')
+unpaired = check_config('unpaired')
 
 def read_samples():
     """Function to get names and dna/rna fastq paths from a sample file
@@ -23,17 +24,24 @@ def read_samples():
     <vcf_sample_id> <unique_sample_name> <dna_bam_path> <fastq1_rna_path> <fastq2_rna_path>
     or (optionally) just these four:
     <vcf_sample_id> <unique_sample_name> <fastq1_rna_path> <fastq2_rna_path>
+    Or if the unpaired config option is also provided, the input file
+    is expected to have these four columns:
+    <vcf_sample_id> <unique_sample_name> <dna_bam_path> <fastq_rna_path>
+    or (optionally) just these three:
+    <vcf_sample_id> <unique_sample_name> <fastq_rna_path>
     Modify this function as needed to provide the correct dictionary."""
     f = open(config['sample_file'], "r")
     samp_dict = {}
     for line in f:
         words = line.strip().split("\t")
         if no_variant_calling:
-            if len(words) == 5:
-                samp_dict[words[1]] = (words[2], (words[3], words[4]))
-            elif len(words) == 4:
+            if len(words) == (5-unpaired):
+                fastq = words[3] if unpaired else (words[3], words[4])
+                samp_dict[words[1]] = (words[2], fastq)
+            elif len(words) == (4-unpaired):
                 # then the dna_bam_path hasn't been specified
-                samp_dict[words[1]] = ((words[2], words[3]),)
+                fastq = words[2] if unpaired else (words[2], words[3])
+                samp_dict[words[1]] = (fastq,)
                 # sanity check to make sure that rna_only is set to true
                 config['rna_only'] = True
             else:

diff --git a/Snakefiles/Snakefile-WASP b/Snakefiles/Snakefile-WASP
@@ -8,11 +8,35 @@ min_version("5.18.0")
 
 include: "snp2h5_rules.smk"
 
+rule uncompress_ref:
+    """temporarily uncompress gzipped files for the STAR index"""
+    input:
+        ref = lambda wildcards: config['ref_genome']
+    output:
+        ref = temp(config['output_dir'] + "/ref_genome/"+Path(config['ref_genome']).stem)
+    conda: "../envs/default.yaml"
+    benchmark: config['output_dir'] + "/benchmark/WASP/uncompress_ref/all.tsv"
+    shell:
+        "gunzip -c {input.ref} > {output.ref}"
+
+rule uncompress_genes:
+    """temporarily uncompress gzipped files for the STAR index"""
+    input:
+        genes = config['gene_info']
+    output:
+        genes = temp(config['output_dir'] + "/ref_genome/"+Path(config['gene_info']).stem)
+    conda: "../envs/default.yaml"
+    benchmark: config['output_dir'] + "/benchmark/WASP/uncompress_genes/all.tsv"
+    shell:
+        "gunzip -c {input.genes} > {output.genes}"
+
 rule create_STAR_index:
     """create an index of the reference genome using STAR"""
     input:
-        ref = lambda wildcards: config['ref_genome'],
-        genes = config['gene_info']
+        ref = lambda wildcards: rules.uncompress_ref.output.ref if \
+            config['ref_genome'].endswith('.gz') else config['ref_genome'],
+        genes = rules.uncompress_genes.output.genes if \
+            config['gene_info'].endswith('.gz') else config['gene_info']
     params:
         overhang = config['sjdbOverhang'] if 'sjdbOverhang' in config and \
             config['sjdbOverhang'] else 100
@@ -27,11 +51,23 @@ rule create_STAR_index:
         "--sjdbOverhang {params.overhang} --runMode genomeGenerate "
         "--genomeFastaFiles {input.ref} --sjdbGTFfile {input.genes}"
 
+rule create_BWA_index:
+    """create an index of the reference genome using BWA"""
+    input:
+        ref = lambda wildcards: config['ref_genome']
+    params:
+        ref = lambda wildcards, output: output[0] + "/" + Path(config['ref_genome']).name
+    output: directory(config['output_dir'] + "/ref_genome/bwa_index")
+    conda: "../envs/default.yaml"
+    benchmark: config['output_dir'] + "/benchmark/WASP/create_BWA_index/all.tsv"
+    shell:
+        "ln -sf {input.ref} {params.ref} && "
+        "bwa index {params.ref}"
+
 rule map_STAR1:
     """map reads using STAR"""
     input:
-        fastq1 = lambda wildcards: SAMP2[wildcards.sample][0],
-        fastq2 = lambda wildcards: SAMP2[wildcards.sample][1],
+        fastq = lambda wildcards: SAMP2[wildcards.sample],
         index = config['ref_genome_star'] if 'ref_genome_star' in config and \
             config['ref_genome_star'] else rules.create_STAR_index.output
     params:
@@ -45,7 +81,7 @@ rule map_STAR1:
     shell:
         "STAR --runThreadN 1 "
             "--genomeDir {input.index} "
-            "--readFilesIn {input.fastq1} {input.fastq2} "
+            "--readFilesIn {input.fastq} "
             "--readFilesCommand zcat "
             "--outSAMtype BAM Unsorted "
             "--alignEndsType EndToEnd "
@@ -54,17 +90,16 @@ rule map_STAR1:
 rule map_BWA1:
     """map ATAC-seq reads using BWA-MEM"""
     input:
-        fastq1 = lambda wildcards: SAMP2[wildcards.sample][0],
-        fastq2 = lambda wildcards: SAMP2[wildcards.sample][1],
-        ref = config['ref_genome']
+        fastq = lambda wildcards: SAMP2[wildcards.sample],
+        ref = lambda wildcards: config['ref_genome']
     output:
         config['output_dir'] + "/map1/{sample}/aln.bam"
     conda: "../envs/default.yaml"
     benchmark: config['output_dir'] + "/benchmark/WASP/map_BWA1/{sample}.tsv"
     resources:
         mem_mb = 30000
     shell:
-        "bwa mem {input.ref} {input.fastq1} {input.fastq2} > {output}"
+        "bwa mem {input.ref} {input.fastq} > {output}"
 
 rule sort_and_index_bam1:
     """sort and index bam generated by first mapping step"""
@@ -93,10 +128,14 @@ rule find_intersecting_snps:
         find_intersecting_snps_script = rules.get_WASP.output.find_intersecting_snps_script
     params:
         sample_names = lambda wildcards: SAMP_TO_VCF_ID[wildcards.sample],
-        output_dir = lambda wildcards, output: str(Path(output.fastq1).parent)
+        output_dir = lambda wildcards, output: str(Path(output.remap_bam).parent),
+        is_paired = "" if 'unpaired' in config and config['unpaired'] else "--is_paired_end "
     output:
-        fastq1 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz",
-        fastq2 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz",
+        fastq = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq.gz" \
+            if 'unpaired' in config and config['unpaired'] else [
+            config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz",
+            config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz"
+        ],
         keep_bam = temp(config['output_dir'] + "/find_intersecting_snps/{sample}.keep.bam"),
         remap_bam = config['output_dir'] + "/find_intersecting_snps/{sample}.to.remap.bam"
     conda: "../envs/default.yaml"
@@ -105,7 +144,7 @@ rule find_intersecting_snps:
         mem_mb = 10000
     shell:
         "python {input.find_intersecting_snps_script} "
-            "--is_paired_end "
+            "{params.is_paired}"
             "--is_sorted "
             "--output_dir {params.output_dir} "
             "--snp_tab {input.snp_tab} "
@@ -117,8 +156,7 @@ rule find_intersecting_snps:
 rule map_STAR2:
     """map reads a second time using STAR"""
     input:
-        fastq1 = rules.find_intersecting_snps.output.fastq1,
-        fastq2 = rules.find_intersecting_snps.output.fastq2,
+        fastq = rules.find_intersecting_snps.output.fastq,
         reference = config['ref_genome_star'] if 'ref_genome_star' in config and \
             config['ref_genome_star'] else rules.create_STAR_index.output
     params:
@@ -132,7 +170,7 @@ rule map_STAR2:
     shell:
         "STAR --runThreadN 1 "
             "--genomeDir {input.reference} "
-            "--readFilesIn {input.fastq1} {input.fastq2} "
+            "--readFilesIn {input.fastq} "
             "--readFilesCommand zcat "
             "--outSAMtype BAM Unsorted "
             "--alignEndsType EndToEnd "
@@ -141,17 +179,16 @@ rule map_STAR2:
 rule map_BWA2:
     """map ATAC-seq reads using BWA-MEM"""
     input:
-        fastq1 = rules.find_intersecting_snps.output.fastq1,
-        fastq2 = rules.find_intersecting_snps.output.fastq2,
-        ref = config['ref_genome']
+        fastq = rules.find_intersecting_snps.output.fastq,
+        ref = lambda wildcards: config['ref_genome']
     output:
         config['output_dir'] + "/map2/{sample}/aln.bam"
     conda: "../envs/default.yaml"
     benchmark: config['output_dir'] + "/benchmark/WASP/map_BWA2/{sample}.tsv"
     resources:
         mem_mb = 30000
     shell:
-        "bwa mem {input.ref} {input.fastq1} {input.fastq2} > {output}"
+        "bwa mem {input.ref} {input.fastq} > {output}"
 
 rule sort_and_index_bam2:
     """sort and index bam generated by second mapping step"""
@@ -207,7 +244,8 @@ rule rmdup:
     input:
         bam = rules.sort_filtered_bam.output.bam,
         index = rules.sort_filtered_bam.output.index,
-        rmdup_script = rules.get_WASP.output.rmdup_script
+        rmdup_script = rules.get_WASP.output.rmdup_script if 'unpaired' in config and \
+            config['unpaired'] else rules.get_WASP.output.rmdup_pe_script
     output:
         rmdup = temp(config['output_dir'] + "/rmdup/{sample}.keep.rmdup.bam"),
         sort = config['output_dir'] + "/rmdup/{sample}.keep.rmdup.sort.bam",

diff --git a/Snakefiles/Snakefile-counts b/Snakefiles/Snakefile-counts
@@ -89,6 +89,9 @@ rule get_as_counts:
         bam = lambda wildcards: SAMP3[wildcards.sample][
             wildcards.type == "rna" and not config['rna_only']
         ],
+        bam_idx = lambda wildcards: SAMP3[wildcards.sample][
+            wildcards.type == "rna" and not config['rna_only']
+        ]+".bai",
         snp_index = rules.vcf2h5.output.snp_index,
         snp_tab = rules.vcf2h5.output.snp_tab,
         haplotype = rules.vcf2h5.output.haplotype,

diff --git a/Snakefiles/snp2h5_rules.smk b/Snakefiles/snp2h5_rules.smk
@@ -12,7 +12,8 @@ rule get_WASP:
     output:
         find_intersecting_snps_script = config['wasp_dir'] + "/mapping/find_intersecting_snps.py",
         filter_remapped_reads_script = config['wasp_dir'] + "/mapping/filter_remapped_reads.py",
-        rmdup_script = config['wasp_dir'] + "/mapping/rmdup_pe.py",
+        rmdup_script = config['wasp_dir'] + "/mapping/rmdup.py",
+        rmdup_pe_script = config['wasp_dir'] + "/mapping/rmdup_pe.py",
         bam2h5_script = config['wasp_dir'] + "/CHT/bam2h5.py",
         makefile = config['wasp_dir'] + "/snp2h5/Makefile"
     conda: "../envs/default.yaml"

diff --git a/configs/config-WASP.yaml b/configs/config-WASP.yaml
@@ -12,6 +12,12 @@
 # conservative version of the ASE test (see the 'rna_only' config below)
 "sample_file" : "data/samples.tsv"
 
+# If your RNA-seq data is single-end instead of paired-end, set this config
+# to true. In that case, you can provide only one FASTQ file in each line of
+# the samples file above. Otherwise, if this option is set to a falsey value
+# or commented out, both FASTQs will be used.
+"unpaired": false
+
 # The path to a reference genome
 # This is not required if "ref_genome_star" is provided below
 # Note: the FASTA file should not be gzipped

diff --git a/run b/run
@@ -67,7 +67,8 @@ if command -v 'slack' &>/dev/null; then
 	if [ "$exit_code" -eq 0 ]; then
 		slack "snakemake finished successfully" &>/dev/null
 	else
-		slack "$(tail -n4 "$out_path/log")" &>/dev/null
+        slack "snakemake as_analysis job failed" &>/dev/null
+        slack "$(tail -n4 "$out_path/log")" &>/dev/null
 	fi
 fi
 exit "$exit_code"