add tophat2

Mpaperlee · Jun 16, 2017 · 128c7a1 · 128c7a1
1 parent 409808c
commit 128c7a1
Show file tree

Hide file tree

Showing 9 changed files with 267 additions and 176 deletions.
diff --git a/CONFIG/hisat2.yaml b/CONFIG/hisat2.yaml
@@ -1,9 +1,12 @@
+#custom scripts
+cufflinks_filter: bin/cufflinks/filt_gtf_after_cufflinks.py
 #sample
 datapath: 
  samplepath: /home/jiajinbu/nas3/zhouran/proj/10-24-Pi-potato/raw_data
  gtfpath: /home/jiajinbu/bu/data/genome/Solanum_tuberosum_potato/2014/hisat2/genome.gtf
  genomepath: /home/jiajinbu/bu/data/genome/Solanum_tuberosum_potato/2014/potato_dm_v404_all_pm_un.fasta
  indexpath: /home/jiajinbu/bu/data/genome/Solanum_tuberosum_potato/2014/hisat2/potato
+ bowtie2_index:
 #soft
 trimmomatic: /home/jiajinbu/bu/soft/bio/trimmomatic/Trimmomatic-0.36/trimmomatic-0.36.jar
 trans_cds: /home/jiajinbu/bu/soft/bio/transdecoder/TransDecoder-3.0.0/util/cufflinks_gtf_genome_to_cdna_fasta.pl
@@ -17,12 +20,16 @@ outputpath:
  gffcompare: data/gffcompare/
  ballgown: data/ballgown/
  transdecoder: data/transdecoder/
+#threads
+thread: 
+ hisat2: 8
+ tophat2: 12
+ cuff: 8
 #hisat2
 Min_intron: 20
 Max_intron: 10000
 Strand_specific: "\u0020"
 #or --rna-strandness FR
-thread: 8
 Hisat2_splicesite: "\u0020"
 #if there is a splicesite file in index,so it's blank [--known-splicesite-infile yourfile]
 

diff --git a/RNAseq.snakefile b/RNAseq.snakefile
@@ -4,25 +4,32 @@ samplepath        = config['datapath']['sample_path']
 genome            = config['datapath']['genome_path']
 gtf               = config['datapath']['gtf_path']
 trimmomatic       = config['trimmomatic']
+
+thread_his		  = config['thread']['hisat2']
+thread_top		  = config['thread']['tophat2']
+thread_cuf		  = config['thread']['cuff']
+
+##tophat2
+bowtie2_index	  = config['datapath']['bowtie2_index']
 ##hisat2
 Hisat2_splicesite = config['Hisat2_splicesite']
 index             = config['datapath']['index_path']
 Max_intron        = config['Max_intron']
 Min_intron        = config['Min_intron']
-thread            = config['thread']
 Strand_specific   = config['Strand_specific']
 ##cufflink
 cuff_lib_type     = config['cuff_lib_type']
-SAMPLE_LIST       = ['A1', 'F4', 'P3', 'Pa', 'Pc', 'Pd', 'Pt13', 'Pt']
-LABLES_string     = []
-for i in sample:
-    a = "data/cuffquant/%s/abundances.cxb" % i
-    LABLES_string.append(a)
+# SAMPLE_LIST       = ['A1', 'F4', 'P3', 'Pa', 'Pc', 'Pd', 'Pt13', 'Pt']
+# LABLES_string     = []
+# for i in sample:
+#     a = "data/cuffquant/%s/abundances.cxb" % i
+#     LABLES_string.append(a)
 #print (LABLES_string)
 #print(Sample)
 ###############outpath################
 tri_path          = config['outputpath']['trimmomatic']
 hisat2_path       = config['outputpath']['hisat2']
+tophat2_path	  = config['outputpath']['tophat2']
 stringtie_path    = config['outputpath']['stringtie']
 cuffquant_path    = config['outputpath']['cuffquant']
 cuffdiff_path     = config['outputpath']['cuffdiff']

diff --git a/bin/cufflinks/filt_gtf_after_cufflinks.py b/bin/cufflinks/filt_gtf_after_cufflinks.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+import argparse
+import sys
+import re
+from itertools import *
+parser = argparse.ArgumentParser(description="Filter the isoforms by the cov(2), exon numbers(2) and length(200). Use this script after cufflinks as a subsitution for next analysis")
+parser.add_argument("-g","--gtf", action="store", dest="gtf",type=str, required=True, 
+                    help="GTF file generated by cufflinks ")
+parser.add_argument("-i","--isoforms", action="store",type = str, dest="isoforms", required=True,\
+                     help="isoforms.fpkm_tracking file generated by cufflinks")
+parser.add_argument("-o","--out", action="store",type = str, dest="out", default='filter.gtf',\
+                    help="the name of the output file")
+argvs = parser.parse_args()
+LEN = 200
+COV = 2
+NUM = 2
+
+def filter_dic(dic):
+    for x in dic.keys():
+        if float(dic[x]['trans_len']) < LEN or float(dic[x]['trans_cov']) < COV or float(dic[x]['exon']) < NUM:
+            del dic[x]
+        else:pass
+    return dic
+with open(argvs.isoforms) as IN1,open(argvs.gtf) as IN2:
+        IN1_dic = {}
+        for i in islice(IN1,1,None):
+            ALL = i.strip().split('\t')
+            IN1_dic[ALL[0]] = {}
+            IN1_dic[ALL[0]]['trans_len'] = ALL[7]
+            IN1_dic[ALL[0]]['trans_cov'] = ALL[8]
+        for i in IN2.readlines():
+            trans_id = re.findall(r'transcript_id "(.*?)";',i)[0]
+            if i.strip().split('\t')[2] == 'transcript':
+                IN1_dic[trans_id]['exon'] = 0
+            elif i.strip().split('\t')[2] == 'exon':
+                IN1_dic[trans_id]['exon'] += 1
+        IN1_dic = filter_dic(IN1_dic)
+with open(argvs.out,'w') as OUT ,open(argvs.gtf) as IN2:
+    for i in IN2.readlines():
+        ID = re.findall(r'transcript_id "(.*?)";',i)[0]
+        if ID in IN1_dic:
+            OUT.write(i)
+        else:pass