Skip to content

Commit

Permalink
add tophat2
Browse files Browse the repository at this point in the history
  • Loading branch information
zhou-ran committed Jun 16, 2017
1 parent 409808c commit 128c7a1
Show file tree
Hide file tree
Showing 9 changed files with 267 additions and 176 deletions.
9 changes: 8 additions & 1 deletion CONFIG/hisat2.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
#custom scripts
cufflinks_filter: bin/cufflinks/filt_gtf_after_cufflinks.py
#sample
datapath:
samplepath: /home/jiajinbu/nas3/zhouran/proj/10-24-Pi-potato/raw_data
gtfpath: /home/jiajinbu/bu/data/genome/Solanum_tuberosum_potato/2014/hisat2/genome.gtf
genomepath: /home/jiajinbu/bu/data/genome/Solanum_tuberosum_potato/2014/potato_dm_v404_all_pm_un.fasta
indexpath: /home/jiajinbu/bu/data/genome/Solanum_tuberosum_potato/2014/hisat2/potato
bowtie2_index:
#soft
trimmomatic: /home/jiajinbu/bu/soft/bio/trimmomatic/Trimmomatic-0.36/trimmomatic-0.36.jar
trans_cds: /home/jiajinbu/bu/soft/bio/transdecoder/TransDecoder-3.0.0/util/cufflinks_gtf_genome_to_cdna_fasta.pl
Expand All @@ -17,12 +20,16 @@ outputpath:
gffcompare: data/gffcompare/
ballgown: data/ballgown/
transdecoder: data/transdecoder/
#threads
thread:
hisat2: 8
tophat2: 12
cuff: 8
#hisat2
Min_intron: 20
Max_intron: 10000
Strand_specific: "\u0020"
#or --rna-strandness FR
thread: 8
Hisat2_splicesite: "\u0020"
#if there is a splicesite file in index,so it's blank [--known-splicesite-infile yourfile]

Expand Down
19 changes: 13 additions & 6 deletions RNAseq.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,32 @@ samplepath = config['datapath']['sample_path']
genome = config['datapath']['genome_path']
gtf = config['datapath']['gtf_path']
trimmomatic = config['trimmomatic']

thread_his = config['thread']['hisat2']
thread_top = config['thread']['tophat2']
thread_cuf = config['thread']['cuff']

##tophat2
bowtie2_index = config['datapath']['bowtie2_index']
##hisat2
Hisat2_splicesite = config['Hisat2_splicesite']
index = config['datapath']['index_path']
Max_intron = config['Max_intron']
Min_intron = config['Min_intron']
thread = config['thread']
Strand_specific = config['Strand_specific']
##cufflink
cuff_lib_type = config['cuff_lib_type']
SAMPLE_LIST = ['A1', 'F4', 'P3', 'Pa', 'Pc', 'Pd', 'Pt13', 'Pt']
LABLES_string = []
for i in sample:
a = "data/cuffquant/%s/abundances.cxb" % i
LABLES_string.append(a)
# SAMPLE_LIST = ['A1', 'F4', 'P3', 'Pa', 'Pc', 'Pd', 'Pt13', 'Pt']
# LABLES_string = []
# for i in sample:
# a = "data/cuffquant/%s/abundances.cxb" % i
# LABLES_string.append(a)
#print (LABLES_string)
#print(Sample)
###############outpath################
tri_path = config['outputpath']['trimmomatic']
hisat2_path = config['outputpath']['hisat2']
tophat2_path = config['outputpath']['tophat2']
stringtie_path = config['outputpath']['stringtie']
cuffquant_path = config['outputpath']['cuffquant']
cuffdiff_path = config['outputpath']['cuffdiff']
Expand Down
43 changes: 43 additions & 0 deletions bin/cufflinks/filt_gtf_after_cufflinks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env python
import argparse
import sys
import re
from itertools import *
parser = argparse.ArgumentParser(description="Filter the isoforms by the cov(2), exon numbers(2) and length(200). Use this script after cufflinks as a subsitution for next analysis")
parser.add_argument("-g","--gtf", action="store", dest="gtf",type=str, required=True,
help="GTF file generated by cufflinks ")
parser.add_argument("-i","--isoforms", action="store",type = str, dest="isoforms", required=True,\
help="isoforms.fpkm_tracking file generated by cufflinks")
parser.add_argument("-o","--out", action="store",type = str, dest="out", default='filter.gtf',\
help="the name of the output file")
argvs = parser.parse_args()
LEN = 200
COV = 2
NUM = 2

def filter_dic(dic):
for x in dic.keys():
if float(dic[x]['trans_len']) < LEN or float(dic[x]['trans_cov']) < COV or float(dic[x]['exon']) < NUM:
del dic[x]
else:pass
return dic
with open(argvs.isoforms) as IN1,open(argvs.gtf) as IN2:
IN1_dic = {}
for i in islice(IN1,1,None):
ALL = i.strip().split('\t')
IN1_dic[ALL[0]] = {}
IN1_dic[ALL[0]]['trans_len'] = ALL[7]
IN1_dic[ALL[0]]['trans_cov'] = ALL[8]
for i in IN2.readlines():
trans_id = re.findall(r'transcript_id "(.*?)";',i)[0]
if i.strip().split('\t')[2] == 'transcript':
IN1_dic[trans_id]['exon'] = 0
elif i.strip().split('\t')[2] == 'exon':
IN1_dic[trans_id]['exon'] += 1
IN1_dic = filter_dic(IN1_dic)
with open(argvs.out,'w') as OUT ,open(argvs.gtf) as IN2:
for i in IN2.readlines():
ID = re.findall(r'transcript_id "(.*?)";',i)[0]
if ID in IN1_dic:
OUT.write(i)
else:pass
Loading

0 comments on commit 128c7a1

Please sign in to comment.