-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChromosomeWalk.py
171 lines (148 loc) · 6.54 KB
/
ChromosomeWalk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
ChromosomeWalk.py
by: Tovah Markowitz
date: 5/11/16
based upon: ChromosomeWalk.py
purpose: to determine the sequence of a genome directly downstream of a known starting point using fastq data
input: start sequence, length to walk, and fastq file
output: raw sequence of length chosen beginning with start sequence
"""
############
# Modules
#from Bio import SeqIO
import re
from datetime import datetime
import optparse
############
# Functions
def Walk (fastqFile, start, output):
# to walk along a chromosome, starting with "start" sequence
# and continuing up to 500bp away using sequencing read information
print fastqFile
# read in all sequences from FASTQ file
reads = seq_from_fastq(fastqFile)
readLen = len(reads[0])
tmp = [ ]
# input sequence (start) defines length of overlap
overlap = len(start)
print start
# get sequence list to include reverse and complement of all reads
reads3 = [get_reverse_complement(ind_read) for ind_read in reads]
reads = reads + reads3
out = [ ]
max_len = overlap
max_len1 = overlap+1
i = 0
# ways this function prevents neverending loops:
# 1) maximum output fragment length is kept to 500bp
# 2) if fragment length stops growing, halt loop
# 3) if number of sequences to analyze becomes too great
while ((max_len1 < 500) & (max_len < max_len1) & (len(tmp) < 25) ):
if i == 0:
tmp = [start]
else:
# tmp == sequences to continue analyzing/appending to
max_len = max_len1
print "Round:" + str(i)
print datetime.now()
print "Max length: " + str(max_len)
print "Going to analyze " + str(len(tmp)) + " sequences"
tmp3 = [ ]
for seq in tmp:
# tmp2 == for searching on a single sequence
tmp2 = [ ]
for ind_read in reads:
# for each sequence in the searchable list tmp take last bases of length overlap as new search string
a = re.search(seq[-overlap:],ind_read)
if bool(a):
# if sequence and read have any sequence upstream of searched sequence
# make sure overlapping sequence is identical
if overlap < len(ind_read[:a.end()]):
if len(seq) > len(ind_read[:a.end()]):
if seq.endswith(ind_read[:a.end()]):
# double check that length of addition is within reason
if len(ind_read[a.end():]) <= (readLen - overlap):
tmp2.append(seq + ind_read[a.end():] )
else: print ind_read + " has extension value that is too high"
else:
if ind_read[:a.end()].endswith(seq):
# double check that length of addition is within reason
if len(ind_read[a.end():]) <= (readLen - overlap):
tmp2.append(seq + ind_read[a.end():] )
else: print ind_read + " has extension value that is too high"
else:
# double check that length of addition is within reason
if len(ind_read[a.end():]) == (readLen - overlap):
tmp2.append(seq + ind_read[a.end():])
else: print ind_read + "does not fit criteria"
# if none of the reads have the search sequence send sequence to out
# otherwise send to tmp3
if bool(tmp2) == False:
out.append(seq)
else:
tmp3[len(tmp3):len(tmp3)] = tmp2
# tmp4 == shorter fragments that are already encompassed into a longer fragment
tmp3 = set(tmp3)
tmp3 = list(tmp3)
tmp4 = [ ]
for j in range(len(tmp3)):
for k in range(len(tmp3)):
if len(tmp3[j]) > len(tmp3[k]):
# if smaller fragment is found within larger fragment, we don't want it as an output
if tmp3[j].find(tmp3[k]) != -1:
tmp4.append(tmp3[k])
if len(tmp3[j]) > max_len1:
max_len1 = len(tmp3[j])
# remove duplicates, and prepare for later steps
tmp4 = set(tmp4)
tmp3 = set(tmp3)
tmp = tmp3.difference(tmp4)
i += 1
out[len(out):len(out)]= tmp3.difference(tmp4)
j = 1
f = open(output,'w')
for k in out:
f.write('>' + str(j) + '\n' + k + '\n')
j += 1
f.close()
def seq_from_fastq(fastqFile):
# get all sequences from the fastq file
f = open (fastqFile,'r')
reads = [ ]
for i,line in enumerate(f):
if i % 4 == 1:
reads.append( line.strip('\n') )
f.close()
return reads
def get_complement(sequence):
conversion_map = {'A':'T' , 'G':'C' , 'C':'G' , 'T':'A', 'N':'N'}
return ''.join( [conversion_map[i] for i in sequence] )
def get_reverse_complement(sequence):
return get_complement( sequence )[::-1]
##################
# MAIN
desc="""
This script is designed to use FASTQ sequence data to walk along a short portion of the genome.
This function does not take into account read quality or the number of times a read is found.
It works by only searching for perfect matches of fragments (all search terms are equal length
to original start sequence). It does require: 2X the size of the FASTQ file as memory and a long
run time (4h average). To minimize neverending loops, I have incorporated the following
terminators to the complete the function: a) no additional reads are found to continue a fragment,
b) longest output fragment reaches beyond 500bp, and c) many potential sequences are identified (over 25).
"""
# parse object for managing input options
parser = optparse.OptionParser(description=desc)
# essential data, defines commandline options
parser.add_option ('-f', dest = 'fastqFile', default = '', help = "This input \
is the name of the fastq file to be used.")
parser.add_option ('-s', dest = 'start', default = '', help = "This input \
is the starting sequence to walk from. Must be all caps.")
parser.add_option ('-o', dest = 'output', default = '', help = "This input \
is the name of the newly created output file.")
# load the inputs
(options,args) = parser.parse_args()
# reads the inputs from commandline
fastqFile = options.fastqFile
start = options.start
output = options.output
Walk(fastqFile, start, output)