diff --git a/Snakefile b/Snakefile index bf2376a..4dc5c2f 100644 --- a/Snakefile +++ b/Snakefile @@ -12,6 +12,7 @@ def check_config(value, default=False, place=config): return place[value] if (value in place and place[value]) else default no_variant_calling = check_config('vcf_file') +unpaired = check_config('unpaired') def read_samples(): """Function to get names and dna/rna fastq paths from a sample file @@ -23,17 +24,24 @@ def read_samples(): or (optionally) just these four: + Or if the unpaired config option is also provided, the input file + is expected to have these four columns: + + or (optionally) just these three: + Modify this function as needed to provide the correct dictionary.""" f = open(config['sample_file'], "r") samp_dict = {} for line in f: words = line.strip().split("\t") if no_variant_calling: - if len(words) == 5: - samp_dict[words[1]] = (words[2], (words[3], words[4])) - elif len(words) == 4: + if len(words) == (5-unpaired): + fastq = words[3] if unpaired else (words[3], words[4]) + samp_dict[words[1]] = (words[2], fastq) + elif len(words) == (4-unpaired): # then the dna_bam_path hasn't been specified - samp_dict[words[1]] = ((words[2], words[3]),) + fastq = words[2] if unpaired else (words[2], words[3]) + samp_dict[words[1]] = (fastq,) # sanity check to make sure that rna_only is set to true config['rna_only'] = True else: diff --git a/Snakefiles/Snakefile-WASP b/Snakefiles/Snakefile-WASP index 572f970..0dcb404 100644 --- a/Snakefiles/Snakefile-WASP +++ b/Snakefiles/Snakefile-WASP @@ -8,11 +8,35 @@ min_version("5.18.0") include: "snp2h5_rules.smk" +rule uncompress_ref: + """temporarily uncompress gzipped files for the STAR index""" + input: + ref = lambda wildcards: config['ref_genome'] + output: + ref = temp(config['output_dir'] + "/ref_genome/"+Path(config['ref_genome']).stem) + conda: "../envs/default.yaml" + benchmark: config['output_dir'] + "/benchmark/WASP/uncompress_ref/all.tsv" + shell: + "gunzip -c {input.ref} > {output.ref}" + +rule uncompress_genes: + """temporarily uncompress gzipped files for the STAR index""" + input: + genes = config['gene_info'] + output: + genes = temp(config['output_dir'] + "/ref_genome/"+Path(config['gene_info']).stem) + conda: "../envs/default.yaml" + benchmark: config['output_dir'] + "/benchmark/WASP/uncompress_genes/all.tsv" + shell: + "gunzip -c {input.genes} > {output.genes}" + rule create_STAR_index: """create an index of the reference genome using STAR""" input: - ref = lambda wildcards: config['ref_genome'], - genes = config['gene_info'] + ref = lambda wildcards: rules.uncompress_ref.output.ref if \ + config['ref_genome'].endswith('.gz') else config['ref_genome'], + genes = rules.uncompress_genes.output.genes if \ + config['gene_info'].endswith('.gz') else config['gene_info'] params: overhang = config['sjdbOverhang'] if 'sjdbOverhang' in config and \ config['sjdbOverhang'] else 100 @@ -27,11 +51,23 @@ rule create_STAR_index: "--sjdbOverhang {params.overhang} --runMode genomeGenerate " "--genomeFastaFiles {input.ref} --sjdbGTFfile {input.genes}" +rule create_BWA_index: + """create an index of the reference genome using BWA""" + input: + ref = lambda wildcards: config['ref_genome'] + params: + ref = lambda wildcards, output: output[0] + "/" + Path(config['ref_genome']).name + output: directory(config['output_dir'] + "/ref_genome/bwa_index") + conda: "../envs/default.yaml" + benchmark: config['output_dir'] + "/benchmark/WASP/create_BWA_index/all.tsv" + shell: + "ln -sf {input.ref} {params.ref} && " + "bwa index {params.ref}" + rule map_STAR1: """map reads using STAR""" input: - fastq1 = lambda wildcards: SAMP2[wildcards.sample][0], - fastq2 = lambda wildcards: SAMP2[wildcards.sample][1], + fastq = lambda wildcards: SAMP2[wildcards.sample], index = config['ref_genome_star'] if 'ref_genome_star' in config and \ config['ref_genome_star'] else rules.create_STAR_index.output params: @@ -45,7 +81,7 @@ rule map_STAR1: shell: "STAR --runThreadN 1 " "--genomeDir {input.index} " - "--readFilesIn {input.fastq1} {input.fastq2} " + "--readFilesIn {input.fastq} " "--readFilesCommand zcat " "--outSAMtype BAM Unsorted " "--alignEndsType EndToEnd " @@ -54,9 +90,8 @@ rule map_STAR1: rule map_BWA1: """map ATAC-seq reads using BWA-MEM""" input: - fastq1 = lambda wildcards: SAMP2[wildcards.sample][0], - fastq2 = lambda wildcards: SAMP2[wildcards.sample][1], - ref = config['ref_genome'] + fastq = lambda wildcards: SAMP2[wildcards.sample], + ref = lambda wildcards: config['ref_genome'] output: config['output_dir'] + "/map1/{sample}/aln.bam" conda: "../envs/default.yaml" @@ -64,7 +99,7 @@ rule map_BWA1: resources: mem_mb = 30000 shell: - "bwa mem {input.ref} {input.fastq1} {input.fastq2} > {output}" + "bwa mem {input.ref} {input.fastq} > {output}" rule sort_and_index_bam1: """sort and index bam generated by first mapping step""" @@ -93,10 +128,14 @@ rule find_intersecting_snps: find_intersecting_snps_script = rules.get_WASP.output.find_intersecting_snps_script params: sample_names = lambda wildcards: SAMP_TO_VCF_ID[wildcards.sample], - output_dir = lambda wildcards, output: str(Path(output.fastq1).parent) + output_dir = lambda wildcards, output: str(Path(output.remap_bam).parent), + is_paired = "" if 'unpaired' in config and config['unpaired'] else "--is_paired_end " output: - fastq1 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz", - fastq2 = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz", + fastq = config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq.gz" \ + if 'unpaired' in config and config['unpaired'] else [ + config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq1.gz", + config['output_dir'] + "/find_intersecting_snps/{sample}.remap.fq2.gz" + ], keep_bam = temp(config['output_dir'] + "/find_intersecting_snps/{sample}.keep.bam"), remap_bam = config['output_dir'] + "/find_intersecting_snps/{sample}.to.remap.bam" conda: "../envs/default.yaml" @@ -105,7 +144,7 @@ rule find_intersecting_snps: mem_mb = 10000 shell: "python {input.find_intersecting_snps_script} " - "--is_paired_end " + "{params.is_paired}" "--is_sorted " "--output_dir {params.output_dir} " "--snp_tab {input.snp_tab} " @@ -117,8 +156,7 @@ rule find_intersecting_snps: rule map_STAR2: """map reads a second time using STAR""" input: - fastq1 = rules.find_intersecting_snps.output.fastq1, - fastq2 = rules.find_intersecting_snps.output.fastq2, + fastq = rules.find_intersecting_snps.output.fastq, reference = config['ref_genome_star'] if 'ref_genome_star' in config and \ config['ref_genome_star'] else rules.create_STAR_index.output params: @@ -132,7 +170,7 @@ rule map_STAR2: shell: "STAR --runThreadN 1 " "--genomeDir {input.reference} " - "--readFilesIn {input.fastq1} {input.fastq2} " + "--readFilesIn {input.fastq} " "--readFilesCommand zcat " "--outSAMtype BAM Unsorted " "--alignEndsType EndToEnd " @@ -141,9 +179,8 @@ rule map_STAR2: rule map_BWA2: """map ATAC-seq reads using BWA-MEM""" input: - fastq1 = rules.find_intersecting_snps.output.fastq1, - fastq2 = rules.find_intersecting_snps.output.fastq2, - ref = config['ref_genome'] + fastq = rules.find_intersecting_snps.output.fastq, + ref = lambda wildcards: config['ref_genome'] output: config['output_dir'] + "/map2/{sample}/aln.bam" conda: "../envs/default.yaml" @@ -151,7 +188,7 @@ rule map_BWA2: resources: mem_mb = 30000 shell: - "bwa mem {input.ref} {input.fastq1} {input.fastq2} > {output}" + "bwa mem {input.ref} {input.fastq} > {output}" rule sort_and_index_bam2: """sort and index bam generated by second mapping step""" @@ -207,7 +244,8 @@ rule rmdup: input: bam = rules.sort_filtered_bam.output.bam, index = rules.sort_filtered_bam.output.index, - rmdup_script = rules.get_WASP.output.rmdup_script + rmdup_script = rules.get_WASP.output.rmdup_script if 'unpaired' in config and \ + config['unpaired'] else rules.get_WASP.output.rmdup_pe_script output: rmdup = temp(config['output_dir'] + "/rmdup/{sample}.keep.rmdup.bam"), sort = config['output_dir'] + "/rmdup/{sample}.keep.rmdup.sort.bam", diff --git a/Snakefiles/Snakefile-counts b/Snakefiles/Snakefile-counts index f34854b..82437b6 100644 --- a/Snakefiles/Snakefile-counts +++ b/Snakefiles/Snakefile-counts @@ -89,6 +89,9 @@ rule get_as_counts: bam = lambda wildcards: SAMP3[wildcards.sample][ wildcards.type == "rna" and not config['rna_only'] ], + bam_idx = lambda wildcards: SAMP3[wildcards.sample][ + wildcards.type == "rna" and not config['rna_only'] + ]+".bai", snp_index = rules.vcf2h5.output.snp_index, snp_tab = rules.vcf2h5.output.snp_tab, haplotype = rules.vcf2h5.output.haplotype, diff --git a/Snakefiles/snp2h5_rules.smk b/Snakefiles/snp2h5_rules.smk index 7cd2c9e..2dea912 100644 --- a/Snakefiles/snp2h5_rules.smk +++ b/Snakefiles/snp2h5_rules.smk @@ -12,7 +12,8 @@ rule get_WASP: output: find_intersecting_snps_script = config['wasp_dir'] + "/mapping/find_intersecting_snps.py", filter_remapped_reads_script = config['wasp_dir'] + "/mapping/filter_remapped_reads.py", - rmdup_script = config['wasp_dir'] + "/mapping/rmdup_pe.py", + rmdup_script = config['wasp_dir'] + "/mapping/rmdup.py", + rmdup_pe_script = config['wasp_dir'] + "/mapping/rmdup_pe.py", bam2h5_script = config['wasp_dir'] + "/CHT/bam2h5.py", makefile = config['wasp_dir'] + "/snp2h5/Makefile" conda: "../envs/default.yaml" diff --git a/configs/config-WASP.yaml b/configs/config-WASP.yaml index 2fe5447..5b39317 100644 --- a/configs/config-WASP.yaml +++ b/configs/config-WASP.yaml @@ -12,6 +12,12 @@ # conservative version of the ASE test (see the 'rna_only' config below) "sample_file" : "data/samples.tsv" +# If your RNA-seq data is single-end instead of paired-end, set this config +# to true. In that case, you can provide only one FASTQ file in each line of +# the samples file above. Otherwise, if this option is set to a falsey value +# or commented out, both FASTQs will be used. +"unpaired": false + # The path to a reference genome # This is not required if "ref_genome_star" is provided below # Note: the FASTA file should not be gzipped diff --git a/run b/run index 3b01bd2..eb7ca53 100755 --- a/run +++ b/run @@ -67,7 +67,8 @@ if command -v 'slack' &>/dev/null; then if [ "$exit_code" -eq 0 ]; then slack "snakemake finished successfully" &>/dev/null else - slack "$(tail -n4 "$out_path/log")" &>/dev/null + slack "snakemake as_analysis job failed" &>/dev/null + slack "$(tail -n4 "$out_path/log")" &>/dev/null fi fi exit "$exit_code"