From 2669d0790b59400d2220e5e2b3b698d95db9e810 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Jan 2021 08:10:31 -0800 Subject: [PATCH 1/7] implement the quick and dirty solution to issue #65: the pipeline will require that you give it two FASTQ files in the samples file, but it will only use the first one support for paired end reads is now broken; you can only do single end reads --- Snakefiles/Snakefile-WASP | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Snakefiles/Snakefile-WASP b/Snakefiles/Snakefile-WASP index 572f970..6841ae0 100644 --- a/Snakefiles/Snakefile-WASP +++ b/Snakefiles/Snakefile-WASP @@ -31,7 +31,7 @@ rule map_STAR1: """map reads using STAR""" input: fastq1 = lambda wildcards: SAMP2[wildcards.sample][0], - fastq2 = lambda wildcards: SAMP2[wildcards.sample][1], +# fastq2 = lambda wildcards: SAMP2[wildcards.sample][1], index = config['ref_genome_star'] if 'ref_genome_star' in config and \ config['ref_genome_star'] else rules.create_STAR_index.output params: @@ -45,7 +45,7 @@ rule map_STAR1: shell: "STAR --runThreadN 1 " "--genomeDir {input.index} " - "--readFilesIn {input.fastq1} {input.fastq2} " + "--readFilesIn {input.fastq1} " "--readFilesCommand zcat " "--outSAMtype BAM Unsorted " "--alignEndsType EndToEnd " @@ -55,7 +55,7 @@ rule map_BWA1: """map ATAC-seq reads using BWA-MEM""" input: fastq1 = lambda wildcards: SAMP2[wildcards.sample][0], - fastq2 = lambda wildcards: SAMP2[wildcards.sample][1], +# fastq2 = lambda wildcards: SAMP2[wildcards.sample][1], ref = config['ref_genome'] output: config['output_dir'] + "/map1/{sample}/aln.bam" @@ -64,7 +64,7 @@ rule map_BWA1: resources: mem_mb = 30000 shell: - "bwa mem {input.ref} {input.fastq1} {input.fastq2} > {output}" + "bwa mem {input.ref} {input.fastq1} > {output}" rule sort_and_index_bam1: """sort and index bam generated by first mapping step""" @@ -96,7 +96,7 @@ rule find_intersecting_snps: output_dir = lambda wildcards, output: str(Path(output.fastq1).parent) output: fastq1 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz", - fastq2 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz", +# fastq2 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz", keep_bam = temp(config['output_dir'] + "/find_intersecting_snps/{sample}.keep.bam"), remap_bam = config['output_dir'] + "/find_intersecting_snps/{sample}.to.remap.bam" conda: "../envs/default.yaml" @@ -105,7 +105,7 @@ rule find_intersecting_snps: mem_mb = 10000 shell: "python {input.find_intersecting_snps_script} " - "--is_paired_end " +# "--is_paired_end " "--is_sorted " "--output_dir {params.output_dir} " "--snp_tab {input.snp_tab} " @@ -118,7 +118,7 @@ rule map_STAR2: """map reads a second time using STAR""" input: fastq1 = rules.find_intersecting_snps.output.fastq1, - fastq2 = rules.find_intersecting_snps.output.fastq2, +# fastq2 = rules.find_intersecting_snps.output.fastq2, reference = config['ref_genome_star'] if 'ref_genome_star' in config and \ config['ref_genome_star'] else rules.create_STAR_index.output params: @@ -132,7 +132,7 @@ rule map_STAR2: shell: "STAR --runThreadN 1 " "--genomeDir {input.reference} " - "--readFilesIn {input.fastq1} {input.fastq2} " + "--readFilesIn {input.fastq1} " "--readFilesCommand zcat " "--outSAMtype BAM Unsorted " "--alignEndsType EndToEnd " @@ -142,7 +142,7 @@ rule map_BWA2: """map ATAC-seq reads using BWA-MEM""" input: fastq1 = rules.find_intersecting_snps.output.fastq1, - fastq2 = rules.find_intersecting_snps.output.fastq2, +# fastq2 = rules.find_intersecting_snps.output.fastq2, ref = config['ref_genome'] output: config['output_dir'] + "/map2/{sample}/aln.bam" @@ -151,7 +151,7 @@ rule map_BWA2: resources: mem_mb = 30000 shell: - "bwa mem {input.ref} {input.fastq1} {input.fastq2} > {output}" + "bwa mem {input.ref} {input.fastq1} > {output}" rule sort_and_index_bam2: """sort and index bam generated by second mapping step""" From d6a7e5a1c1e28b944e871b8c92c05ad62010a736 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Jan 2021 13:45:28 -0800 Subject: [PATCH 2/7] rename output from unpaired find_intersecting_snps --- Snakefiles/Snakefile-WASP | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefiles/Snakefile-WASP b/Snakefiles/Snakefile-WASP index 6841ae0..54b60d3 100644 --- a/Snakefiles/Snakefile-WASP +++ b/Snakefiles/Snakefile-WASP @@ -95,7 +95,7 @@ rule find_intersecting_snps: sample_names = lambda wildcards: SAMP_TO_VCF_ID[wildcards.sample], output_dir = lambda wildcards, output: str(Path(output.fastq1).parent) output: - fastq1 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz", + fastq1 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq.gz", # fastq2 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz", keep_bam = temp(config['output_dir'] + "/find_intersecting_snps/{sample}.keep.bam"), remap_bam = config['output_dir'] + "/find_intersecting_snps/{sample}.to.remap.bam" From 2c09708845aee8c448d279dc4a7298b088753236 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Jan 2021 21:11:38 -0800 Subject: [PATCH 3/7] use rmdup.py script instead of rmdup_pe.py with unpaired reads in wasp subworkflow --- Snakefiles/snp2h5_rules.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Snakefiles/snp2h5_rules.smk b/Snakefiles/snp2h5_rules.smk index 7cd2c9e..2dea912 100644 --- a/Snakefiles/snp2h5_rules.smk +++ b/Snakefiles/snp2h5_rules.smk @@ -12,7 +12,8 @@ rule get_WASP: output: find_intersecting_snps_script = config['wasp_dir'] + "/mapping/find_intersecting_snps.py", filter_remapped_reads_script = config['wasp_dir'] + "/mapping/filter_remapped_reads.py", - rmdup_script = config['wasp_dir'] + "/mapping/rmdup_pe.py", + rmdup_script = config['wasp_dir'] + "/mapping/rmdup.py", + rmdup_pe_script = config['wasp_dir'] + "/mapping/rmdup_pe.py", bam2h5_script = config['wasp_dir'] + "/CHT/bam2h5.py", makefile = config['wasp_dir'] + "/snp2h5/Makefile" conda: "../envs/default.yaml" From 0297c3a8e2c8d7d2a581ef9e1e8eae8472145c8d Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Jan 2021 21:59:56 -0800 Subject: [PATCH 4/7] add support for single end reads in wasp subworkflow (resolves #65) --- Snakefile | 16 ++++++++++++---- Snakefiles/Snakefile-WASP | 35 ++++++++++++++++++----------------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/Snakefile b/Snakefile index bf2376a..4dc5c2f 100644 --- a/Snakefile +++ b/Snakefile @@ -12,6 +12,7 @@ def check_config(value, default=False, place=config): return place[value] if (value in place and place[value]) else default no_variant_calling = check_config('vcf_file') +unpaired = check_config('unpaired') def read_samples(): """Function to get names and dna/rna fastq paths from a sample file @@ -23,17 +24,24 @@ def read_samples(): or (optionally) just these four: + Or if the unpaired config option is also provided, the input file + is expected to have these four columns: + + or (optionally) just these three: + Modify this function as needed to provide the correct dictionary.""" f = open(config['sample_file'], "r") samp_dict = {} for line in f: words = line.strip().split("\t") if no_variant_calling: - if len(words) == 5: - samp_dict[words[1]] = (words[2], (words[3], words[4])) - elif len(words) == 4: + if len(words) == (5-unpaired): + fastq = words[3] if unpaired else (words[3], words[4]) + samp_dict[words[1]] = (words[2], fastq) + elif len(words) == (4-unpaired): # then the dna_bam_path hasn't been specified - samp_dict[words[1]] = ((words[2], words[3]),) + fastq = words[2] if unpaired else (words[2], words[3]) + samp_dict[words[1]] = (fastq,) # sanity check to make sure that rna_only is set to true config['rna_only'] = True else: diff --git a/Snakefiles/Snakefile-WASP b/Snakefiles/Snakefile-WASP index 54b60d3..8c52091 100644 --- a/Snakefiles/Snakefile-WASP +++ b/Snakefiles/Snakefile-WASP @@ -30,8 +30,7 @@ rule create_STAR_index: rule map_STAR1: """map reads using STAR""" input: - fastq1 = lambda wildcards: SAMP2[wildcards.sample][0], -# fastq2 = lambda wildcards: SAMP2[wildcards.sample][1], + fastq = lambda wildcards: SAMP2[wildcards.sample], index = config['ref_genome_star'] if 'ref_genome_star' in config and \ config['ref_genome_star'] else rules.create_STAR_index.output params: @@ -45,7 +44,7 @@ rule map_STAR1: shell: "STAR --runThreadN 1 " "--genomeDir {input.index} " - "--readFilesIn {input.fastq1} " + "--readFilesIn {input.fastq} " "--readFilesCommand zcat " "--outSAMtype BAM Unsorted " "--alignEndsType EndToEnd " @@ -54,8 +53,7 @@ rule map_STAR1: rule map_BWA1: """map ATAC-seq reads using BWA-MEM""" input: - fastq1 = lambda wildcards: SAMP2[wildcards.sample][0], -# fastq2 = lambda wildcards: SAMP2[wildcards.sample][1], + fastq = lambda wildcards: SAMP2[wildcards.sample], ref = config['ref_genome'] output: config['output_dir'] + "/map1/{sample}/aln.bam" @@ -64,7 +62,7 @@ rule map_BWA1: resources: mem_mb = 30000 shell: - "bwa mem {input.ref} {input.fastq1} > {output}" + "bwa mem {input.ref} {input.fastq} > {output}" rule sort_and_index_bam1: """sort and index bam generated by first mapping step""" @@ -93,10 +91,14 @@ rule find_intersecting_snps: find_intersecting_snps_script = rules.get_WASP.output.find_intersecting_snps_script params: sample_names = lambda wildcards: SAMP_TO_VCF_ID[wildcards.sample], - output_dir = lambda wildcards, output: str(Path(output.fastq1).parent) + output_dir = lambda wildcards, output: str(Path(output.remap_bam).parent), + is_paired = "" if 'unpaired' in config and config['unpaired'] else "--is_paired_end " output: - fastq1 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq.gz", -# fastq2 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz", + fastq = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq.gz" \ + if 'unpaired' in config and config['unpaired'] else [ + config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz", + config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz" + ], keep_bam = temp(config['output_dir'] + "/find_intersecting_snps/{sample}.keep.bam"), remap_bam = config['output_dir'] + "/find_intersecting_snps/{sample}.to.remap.bam" conda: "../envs/default.yaml" @@ -105,7 +107,7 @@ rule find_intersecting_snps: mem_mb = 10000 shell: "python {input.find_intersecting_snps_script} " -# "--is_paired_end " + "{params.is_paired}" "--is_sorted " "--output_dir {params.output_dir} " "--snp_tab {input.snp_tab} " @@ -117,8 +119,7 @@ rule find_intersecting_snps: rule map_STAR2: """map reads a second time using STAR""" input: - fastq1 = rules.find_intersecting_snps.output.fastq1, -# fastq2 = rules.find_intersecting_snps.output.fastq2, + fastq = rules.find_intersecting_snps.output.fastq, reference = config['ref_genome_star'] if 'ref_genome_star' in config and \ config['ref_genome_star'] else rules.create_STAR_index.output params: @@ -132,7 +133,7 @@ rule map_STAR2: shell: "STAR --runThreadN 1 " "--genomeDir {input.reference} " - "--readFilesIn {input.fastq1} " + "--readFilesIn {input.fastq} " "--readFilesCommand zcat " "--outSAMtype BAM Unsorted " "--alignEndsType EndToEnd " @@ -141,8 +142,7 @@ rule map_STAR2: rule map_BWA2: """map ATAC-seq reads using BWA-MEM""" input: - fastq1 = rules.find_intersecting_snps.output.fastq1, -# fastq2 = rules.find_intersecting_snps.output.fastq2, + fastq = rules.find_intersecting_snps.output.fastq, ref = config['ref_genome'] output: config['output_dir'] + "/map2/{sample}/aln.bam" @@ -151,7 +151,7 @@ rule map_BWA2: resources: mem_mb = 30000 shell: - "bwa mem {input.ref} {input.fastq1} > {output}" + "bwa mem {input.ref} {input.fastq} > {output}" rule sort_and_index_bam2: """sort and index bam generated by second mapping step""" @@ -207,7 +207,8 @@ rule rmdup: input: bam = rules.sort_filtered_bam.output.bam, index = rules.sort_filtered_bam.output.index, - rmdup_script = rules.get_WASP.output.rmdup_script + rmdup_script = rules.get_WASP.output.rmdup_script if 'unpaired' in config and \ + config['unpaired'] else rules.get_WASP.output.rmdup_pe_script output: rmdup = temp(config['output_dir'] + "/rmdup/{sample}.keep.rmdup.bam"), sort = config['output_dir'] + "/rmdup/{sample}.keep.rmdup.sort.bam", From e5e1bb1178002989f73330272867051c7b4b1b56 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Jan 2021 22:04:51 -0800 Subject: [PATCH 5/7] send another slack message just in case the other one doesn't work --- run | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run b/run index 3b01bd2..eb7ca53 100755 --- a/run +++ b/run @@ -67,7 +67,8 @@ if command -v 'slack' &>/dev/null; then if [ "$exit_code" -eq 0 ]; then slack "snakemake finished successfully" &>/dev/null else - slack "$(tail -n4 "$out_path/log")" &>/dev/null + slack "snakemake as_analysis job failed" &>/dev/null + slack "$(tail -n4 "$out_path/log")" &>/dev/null fi fi exit "$exit_code" From 540728c2cbbccabf177fe55874d639a0b97a89fb Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 19 Jan 2021 22:06:12 -0800 Subject: [PATCH 6/7] add unpaired config option to wasp subworkflow (see #65) --- configs/config-WASP.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/configs/config-WASP.yaml b/configs/config-WASP.yaml index 2fe5447..5b39317 100644 --- a/configs/config-WASP.yaml +++ b/configs/config-WASP.yaml @@ -12,6 +12,12 @@ # conservative version of the ASE test (see the 'rna_only' config below) "sample_file" : "data/samples.tsv" +# If your RNA-seq data is single-end instead of paired-end, set this config +# to true. In that case, you can provide only one FASTQ file in each line of +# the samples file above. Otherwise, if this option is set to a falsey value +# or commented out, both FASTQs will be used. +"unpaired": false + # The path to a reference genome # This is not required if "ref_genome_star" is provided below # Note: the FASTA file should not be gzipped From a385a4be10ef311f27a776f9d9deaa1525b1ac95 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Tue, 23 Mar 2021 12:46:36 -0700 Subject: [PATCH 7/7] start on work for un-gzipping STAR files (see #64) still needs to be tested --- Snakefiles/Snakefile-WASP | 45 +++++++++++++++++++++++++++++++++---- Snakefiles/Snakefile-counts | 3 +++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/Snakefiles/Snakefile-WASP b/Snakefiles/Snakefile-WASP index 8c52091..0dcb404 100644 --- a/Snakefiles/Snakefile-WASP +++ b/Snakefiles/Snakefile-WASP @@ -8,11 +8,35 @@ min_version("5.18.0") include: "snp2h5_rules.smk" +rule uncompress_ref: + """temporarily uncompress gzipped files for the STAR index""" + input: + ref = lambda wildcards: config['ref_genome'] + output: + ref = temp(config['output_dir'] + "/ref_genome/"+Path(config['ref_genome']).stem) + conda: "../envs/default.yaml" + benchmark: config['output_dir'] + "/benchmark/WASP/uncompress_ref/all.tsv" + shell: + "gunzip -c {input.ref} > {output.ref}" + +rule uncompress_genes: + """temporarily uncompress gzipped files for the STAR index""" + input: + genes = config['gene_info'] + output: + genes = temp(config['output_dir'] + "/ref_genome/"+Path(config['gene_info']).stem) + conda: "../envs/default.yaml" + benchmark: config['output_dir'] + "/benchmark/WASP/uncompress_genes/all.tsv" + shell: + "gunzip -c {input.genes} > {output.genes}" + rule create_STAR_index: """create an index of the reference genome using STAR""" input: - ref = lambda wildcards: config['ref_genome'], - genes = config['gene_info'] + ref = lambda wildcards: rules.uncompress_ref.output.ref if \ + config['ref_genome'].endswith('.gz') else config['ref_genome'], + genes = rules.uncompress_genes.output.genes if \ + config['gene_info'].endswith('.gz') else config['gene_info'] params: overhang = config['sjdbOverhang'] if 'sjdbOverhang' in config and \ config['sjdbOverhang'] else 100 @@ -27,6 +51,19 @@ rule create_STAR_index: "--sjdbOverhang {params.overhang} --runMode genomeGenerate " "--genomeFastaFiles {input.ref} --sjdbGTFfile {input.genes}" +rule create_BWA_index: + """create an index of the reference genome using BWA""" + input: + ref = lambda wildcards: config['ref_genome'] + params: + ref = lambda wildcards, output: output[0] + "/" + Path(config['ref_genome']).name + output: directory(config['output_dir'] + "/ref_genome/bwa_index") + conda: "../envs/default.yaml" + benchmark: config['output_dir'] + "/benchmark/WASP/create_BWA_index/all.tsv" + shell: + "ln -sf {input.ref} {params.ref} && " + "bwa index {params.ref}" + rule map_STAR1: """map reads using STAR""" input: @@ -54,7 +91,7 @@ rule map_BWA1: """map ATAC-seq reads using BWA-MEM""" input: fastq = lambda wildcards: SAMP2[wildcards.sample], - ref = config['ref_genome'] + ref = lambda wildcards: config['ref_genome'] output: config['output_dir'] + "/map1/{sample}/aln.bam" conda: "../envs/default.yaml" @@ -143,7 +180,7 @@ rule map_BWA2: """map ATAC-seq reads using BWA-MEM""" input: fastq = rules.find_intersecting_snps.output.fastq, - ref = config['ref_genome'] + ref = lambda wildcards: config['ref_genome'] output: config['output_dir'] + "/map2/{sample}/aln.bam" conda: "../envs/default.yaml" diff --git a/Snakefiles/Snakefile-counts b/Snakefiles/Snakefile-counts index f34854b..82437b6 100644 --- a/Snakefiles/Snakefile-counts +++ b/Snakefiles/Snakefile-counts @@ -89,6 +89,9 @@ rule get_as_counts: bam = lambda wildcards: SAMP3[wildcards.sample][ wildcards.type == "rna" and not config['rna_only'] ], + bam_idx = lambda wildcards: SAMP3[wildcards.sample][ + wildcards.type == "rna" and not config['rna_only'] + ]+".bai", snp_index = rules.vcf2h5.output.snp_index, snp_tab = rules.vcf2h5.output.snp_tab, haplotype = rules.vcf2h5.output.haplotype,