-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBioWrapper.py
More file actions
245 lines (225 loc) · 7.12 KB
/
BioWrapper.py
File metadata and controls
245 lines (225 loc) · 7.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/usr/bin/python
import os
from Bio import Entrez
from Bio import SeqIO
############################
# Functions #
############################
def get_gene(gene_name,gene_list):
for gene in gene_list:
if gene.name == gene_name:
return gene
print 'gene not found!'
return
def nice_write(s,ind=60):
o=''
lines=len(s)/ind
if lines==0:
return s + '\n'
for l in range(1,lines+1):
st,end=ind*(l-1),(ind*l)-1
o= o + s[st:end+1] + '\n'
o= o + s[end+1:] + '\n'
return o
def rev_complement(seq):
# get reverse complement of a sequence
d= {'A': 'T', 'C': 'G',
'G': 'C', 'T': 'A',
'a': 't', 'c': 'g',
'g': 'c', 't': 'a'}
rev_compl=''.join(d.get(n, 'N') for n in seq[::-1])
return rev_compl
def get_from_genbank(accession,mail='emanuele.bosi@unifi.it'):
Entrez.email=mail
handle = Entrez.efetch(db="nucleotide",
id=accession,
rettype="gbwithparts")
record = SeqIO.read(handle,"genbank")
handle.close()
return record
def get_genes(record):
#return only genes as list of SeqFeatures
genes = [Gene(r) for r in record.features if r.type == 'CDS' and 'gene' in r.qualifiers.keys() ]
return genes
def get_gene_name(feat):
#return gene name(s)
if len(feat.qualifiers['gene']) > 1:
name=','.join(feat.qualifiers['gene'])
else:
name=feat.qualifiers['gene'][0]
return name
def get_locus_tag(feat):
#return gene name(s)
if len(feat.qualifiers['locus_tag']) > 1:
name=','.join(feat.qualifiers['locus_tag'])
else:
name=feat.qualifiers['locus_tag'][0]
return name
def get_whole_genome_seq(record):
#return whole genome seq
whole_genome= record.seq.tostring()
return whole_genome
def get_subseq(tupla,genome_seq,flag=False,b_up=400,b_down=0):
#given some indices, return the corresponding genomic subseq
strand=tupla[-1]
if tupla[0] <= tupla[1]:
if strand == '+':
left=max(tupla[0],tupla[1]-b_up)
right=tupla[1]+b_down
return genome_seq[left:right]
else:
left=tupla[0]-b_down
right=min(tupla[1],tupla[0]+b_up)
return rev_complement(genome_seq[left:right])
if flag == 'wrapped around':
if strand=='+':
seq= ( genome_seq[tupla[0]:]
+
genome_seq[:tupla[1]+b_down]
)
if len(seq) > b_up+b_down:
seq=seq[-b_up:]
return seq
else:
seq= ( rev_complement(genome_seq[:tupla[1]])
+
rev_complement(genome_seq[tupla[0]-b_down:])
)
if len(seq) > b_up+b_down:
seq=seq[-b_up:]
return seq
if b_down > 0:
if strand == '+':
left=tupla[1]
right=tupla[1]+b_down
return genome_seq[left:right]
else:
left=tupla[0]-b_down
right=tupla[0]
return rev_complement(genome_seq[left:right])
return ''
def get_subseq_all(tupla,genome_seq,flag=False,b_up=400,b_down=0):
#given some indices, return the corresponding genomic subseq
strand=tupla[-1]
if tupla[0] <= tupla[1]:
if strand == '+':
left=tupla[0]
if left < 0:
left = 0
right=tupla[1]
return genome_seq[left:right]
else:
left=tupla[0]
right=tupla[0]
if right > len(genome_seq):
right = len(genome_seq)
return rev_complement(genome_seq[left:right])
if flag == 'wrapped around':
if strand=='+':
seq= ( genome_seq[tupla[0]:]
+
genome_seq[:tupla[1]]
)
if len(seq) > b_up+b_down:
seq=seq[-b_up:]
return seq
else:
seq= ( rev_complement(genome_seq[:tupla[1]])
+
rev_complement(genome_seq[tupla[0]:])
)
if len(seq) > b_up+b_down:
seq=seq[-b_up:]
return seq
if b_down > 0:
if strand == '+':
left=tupla[1]
right=tupla[1]
return genome_seq[left:right]
else:
left=tupla[0]
right=tupla[0]
return rev_complement(genome_seq[left:right])
return ''
def get_all_upstreams(genes,whole_seq,b_up=400,b_down=0,is_circ=True):
tot=len(genes)
if tot == 0:
return
elif tot == 1:
if genes[0].strand == '+':
upstr = [(genes[0].start,genes[0].start,genes[0].name,genes[0].strand)]
else:
upstr = [(genes[0].end,genes[0].end,genes[0].name,genes[0].strand)]
else:
upstr=[ (genes[i-1].end,genes[i].start,genes[i].name,genes[i].strand) if genes[i].strand == '+'
else
(genes[i].end,genes[i+1].start,genes[i].name,genes[i].strand)
for i in range(-1,tot-1)]
upstr= upstr[1:] + [upstr[0]]
if is_circ:
out=[Upstream(upstr[0][2],upstr[0][1],upstr[0][0],get_subseq(upstr[0],whole_seq,'wrapped around',b_up,b_down),upstr[0][3],b_up,b_down)]
for up in upstr[1:tot - 1]:
if up[0] > up[1]:
if up[3] == '+':
u = Upstream(up[2],up[1]+b_down,up[1],get_subseq(up,whole_seq,b_up=b_up,b_down=b_down),up[3],b_up,b_down)
else:
u = Upstream(up[2],up[1],up[1]-b_down,get_subseq(up,whole_seq,b_up=b_up,b_down=b_down),up[3],b_up,b_down)
else:
if up[3] == '+':
u = Upstream(up[2],up[1]+b_down,up[0],get_subseq(up,whole_seq,b_up=b_up,b_down=b_down),up[3],b_up,b_down)
else:
u = Upstream(up[2],up[1],up[0]-b_down,get_subseq(up,whole_seq,b_up=b_up,b_down=b_down),up[3],b_up,b_down)
out.append(u)
out+=[Upstream(upstr[-1][2],upstr[-1][1],upstr[-1][0],get_subseq(upstr[-1],whole_seq,'wrapped around',b_up,b_down),upstr[-1][3],b_up,b_down)]
else:
out=[Upstream(up[2],up[1],up[0],get_subseq(up,whole_seq,b_up=b_up,b_down=b_down),up[3],b_up,b_down) for up in upstr]
return out
def get_all_upstreams_all(genes,whole_seq,b_up=400,b_down=0,is_circ=True):
tot=len(genes)
if tot == 0:
return
elif tot == 1:
if genes[0].strand == '+':
upstr = [(genes[0].start-b_up,genes[0].start+b_down,genes[0].name,genes[0].strand)]
else:
upstr = [(genes[0].end-b_down,genes[0].end+b_up,genes[0].name,genes[0].strand)]
else:
upstr=[ (genes[i].start-b_up,genes[i].start+b_down,genes[i].name,genes[i].strand) if genes[i].strand == '+'
else
(genes[i].end-b_down,genes[i].end+b_up,genes[i].name,genes[i].strand)
for i in range(-1,tot-1)]
upstr= upstr[1:] + [upstr[0]]
if is_circ:
out=[Upstream(upstr[0][2],upstr[0][1],upstr[0][0],get_subseq_all(upstr[0],whole_seq,'wrapped around',b_up,b_down),upstr[0][3],b_up,b_down)]
out+=[Upstream(up[2],up[1],up[0],get_subseq_all(up,whole_seq,b_up=b_up,b_down=b_down),up[3],b_up,b_down) for up in upstr[1:tot - 1]]
out+=[Upstream(upstr[-1][2],upstr[-1][1],upstr[-1][0],get_subseq_all(upstr[-1],whole_seq,'wrapped around',b_up,b_down),upstr[-1][3],b_up,b_down)]
else:
out=[Upstream(up[2],up[1],up[0],get_subseq_all(up,whole_seq,b_up=b_up,b_down=b_down),up[3],b_up,b_down) for up in upstr]
return out
############################
# Classes #
############################
class Gene(object):
#
def __init__(self,SeqFeature):
self.name=get_gene_name(SeqFeature)
self.start=int(SeqFeature.location.start)
self.end=int(SeqFeature.location.end)
self.strand='+' if SeqFeature.location.strand == 1 else '-'
#
def get_sequence(self,complete_genome):
tupla=tuple((self.start,self.end,self.strand))
return get_subseq(tupla,complete_genome)
#
#
class MLST(object):
def __init__(self,file_):
self.file_=file_
self.path='/' + '/'.join(self.file_.split('/')[:-1])
self.file_name=self.file_.split('/')[0]
self.locus_name=self.file_name.split('_')[0]
handle = open(self.file_, "rU")
self.alleles=[record for record in SeqIO.parse(handle, "fasta")]
handle.close()
#
#