-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvplot.py
308 lines (284 loc) · 12 KB
/
vplot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
# vplot.py
# made by Tovah Markowitz
# uses python3
# 1/22/18
#########################
# MODULES
import matplotlib.pyplot as plt
import numpy
import optparse
from datetime import datetime
import pysam # requires 2.7.12 on HPC
#########################
# FUNCTIONS
def import_sam_pysam(sam_file):
"""Import sam file using pysam and save only query name, chromosome
name, start position, and fragment length as a list of lists.
"""
print("Reading in SAM file.")
print(datetime.now())
sam_in = pysam.AlignmentFile(sam_file,"r")
sam_in2 = [row for row in sam_in.fetch(until_eof=True) ]
sam_map = [[row.query_name,
row.reference_name, # chromosome name
(row.reference_start + 1), # 0-based
row.template_length ] # fragment length
for row in sam_in2 if (row.flag != 77) & (row.flag != 141)]
print("Finished.")
print(datetime.now())
return sam_map
def parse_pair_pysam(sam_list):
"""Parse paired-end sequencing data to extract chromosomes,
fragment midpoint, and fragment length. The input should be the output
of import_sam_pysam. The output is a dictionary where the keys are
chromosomes and the values are lists of midpoint and fragment length
pairs. This function only works when reads can only be mapped once.
"""
sam_list.sort()
chrs = set(row[1] for row in sam_list)
sam_dict = {chrom: [ ] for chrom in chrs}
for i in range(len(sam_list) - 1):
if (i % 5000000) == 0:
print("Parsed " + str(i) + " lines.")
print(datetime.now())
if " " in sam_list[i][0]:
print("Caution: read was not processed. " + sam_list[i][0])
elif sam_list[i][0] == sam_list[i + 1][0]:
if sam_list[i][1] == sam_list[i + 1][1]:
chrom = sam_list[i][1]
frag_len = abs(int(sam_list[i][3]))
start = min(int(sam_list[i][2]), int(sam_list[i + 1][2]))
end = start + frag_len
mid = int((start + end) / 2)
sam_dict[chrom].append([mid, frag_len])
else:
print("Caution: paired reads mapped to different chromosomes. "
+ sam_list[i][0])
return sam_dict
def import_sam(sam_file):
"""Read in sam file of sequencing data. The output is a list of lists
containing data on all mapping reads.
"""
print("Reading in SAM file.")
print(datetime.now())
f = open(sam_file, 'r')
sam_in = f.readlines()
f.close
sam = [row.strip().split('\t') for row in sam_in
if not row.startswith("@")]
# removing rows with FLAG indicating no mapping of paired reads
sam_map = [row for row in sam if (row[1] != '77') & (row[1] != '141')]
print("Finished.")
print(datetime.now())
return sam_map
def parse_pair_sam(sam_list):
"""Parse paired-end sequencing data to extract chromosomes, fragment
midpoint, and fragment length. The input should be the output of
import_sam. The output is a dictionary where the keys are chromosomes
and the values are lists of midpoint and fragment length pairs. This
function only works when reads can only be mapped once.
"""
sam_list.sort()
chrs = set(row[2] for row in sam_list)
sam_dict = {chrom: [ ] for chrom in chrs}
for i in range(len(sam_list) - 1):
if (i % 5000000) == 0:
print("Parsed " + str(i) + " lines.")
print(datetime.now())
if " " in sam_list[i][0]:
print("Caution: read was not processed. " + sam_list[i][0])
elif sam_list[i][0] == sam_list[i + 1][0]:
if sam_list[i][2] == sam_list[i + 1][2]:
if sam_list[i][3] == sam_list[i + 1][7]:
chrom = sam_list[i][2]
frag_len = abs(int(sam_list[i][8]))
start = min(int(sam_list[i][3]), int(sam_list[i][7]))
end = start + frag_len
mid = int((start + end) / 2)
sam_dict[chrom].append([mid, frag_len])
else:
print("Caution: pair positions don't match. "
+ sam_list[i][0])
else:
print("Caution: paired reads mapped to different chromosomes. "
+ sam_list[i][0])
return sam_dict
def extract_lengths(sam_dict):
"""Extract fragment lengths of all mapped reads from dictionary.
"""
lengths = [ ]
for key in sam_dict.keys():
tmp = [row[1] for row in sam_dict[key]]
lengths.extend(tmp)
lengths.sort()
return lengths
def plot_hist(data, out_hist, title="", xlab=""):
"""Plot a list of data as a histogram.
"""
if "." in out_hist:
outfile_name = out_hist
else:
outfile_name = out_hist + ".pdf"
# plt.hist(data, bins=100, density=True)
plt.hist(data, bins=100, normed=1)
plt.title(title)
plt.xlabel(xlab)
plt.ylabel("Density")
plt.savefig(outfile_name)
plt.close("all")
def import_bed(bedfile):
"""Read in bed file and extract chromosome and midpoints.
Orientation ignored.
"""
f = open(bedfile, 'r')
bed_in = f.readlines()
f.close
bed = [row.strip().split('\t') for row in bed_in]
chrs = set(row[0] for row in bed)
midpoints = {chrom: [ ] for chrom in chrs}
for row in bed:
midpoint = int((int(row[1]) + int(row[2]))/2)
midpoints[row[0]].append(midpoint)
for key in midpoints.keys():
midpoints[key].sort()
return midpoints
def find_nearby(sam_dict, midpoints, max_range):
"""Identify all reads that are within a certain distance of a feature
of interest. Assumes both inputs [sam_dict and midpoints] are
dictionaries with keys being chromosome names. Max_range is the maximum
distance from the midpoints that will be used on the plot.
"""
nearby = [ ]
for key in midpoints.keys():
print("Starting to process " + key + " for vplot")
print(datetime.now())
for midpoint in midpoints[key]:
if sam_dict.get(key):
tmp=[[read[0] - midpoint, read[1]] for read
in sam_dict[key]
if abs(midpoint - read[0]) < max_range]
nearby.extend(tmp)
return nearby
def group_data(nearby, dist_window_size, fraglen_window_size,
max_range,max_frag_length):
"""Group data from find_nearby into windows based upon both fragment
length and distance from feature midpoints.
"""
maxfrag = max([row[1] for row in nearby])
if maxfrag > max_frag_length:
maxfrag = max_frag_length
minfrag = min([row[1] for row in nearby])
frag_cut = [a for a in range(minfrag, maxfrag, fraglen_window_size)]
dist_cut = [a for a in range(
int(max_range / dist_window_size) * -dist_window_size,
max_range + dist_window_size, dist_window_size)]
grouped_data = [[] for _ in range(len(frag_cut)-1)]
for i in range(len(frag_cut)-1):
within_frag = [data for data in nearby if
frag_cut[i] <= data[1] < frag_cut[i+1]]
for j in range(len(dist_cut)-1):
within_dist = [data for data in within_frag if
dist_cut[j] <= data[0] < dist_cut[j+1]]
grouped_data[i].append(len(within_dist))
return(grouped_data, frag_cut, dist_cut)
def vplot(grouped_data, frag_cut_points, dist_cut_points, out_vplot, xlab="",
title=""):
"""Plot group_data as a heatmap with axes identifying distances.
"""
if "." in out_vplot:
outfile_name = out_vplot
else:
outfile_name = out_vplot + ".pdf"
plt.imshow(grouped_data, cmap=plt.cm.bone_r, origin='lower', aspect='auto')
plt.colorbar()
min_dist = dist_cut_points[0]
max_dist = dist_cut_points[-1]
byX = len(grouped_data[0]) / 4
plt.xticks(numpy.arange(-0.5, len(grouped_data[0]), byX),
[min_dist, min_dist / 2, 0, max_dist / 2, max_dist])
min_frag = frag_cut_points[0]
max_frag = frag_cut_points[-1]
frag_dist = (max_frag - min_frag) / 4
byY = len(grouped_data) / 4
plt.yticks(numpy.arange(-0.5, len(grouped_data), byY),
[min_frag, min_frag + frag_dist, min_frag + 2*frag_dist,
max_frag - frag_dist, max_frag])
plt.title(title)
if xlab:
plt.xlabel(xlab)
else:
plt.xlabel("Distance from feature (bp)")
plt.ylabel("Fragment size (bp)")
plt.savefig(outfile_name)
plt.close("all")
#########################
# MAIN
desc="""
This script is designed to replicate that of the v-plot used by the Henikoff
lab. See PMID: 22025700 for details. Specifically, the plot is a 2-D heatmap
where the x-axis is the distance from a specified feature (such as a motif) and
the y-axis is the fragment size, and intensity indicates the number of reads
within a given bin. In the words of the Henikoff lab: the length of each
fragment was plotted as a function of the distance from the fragment midpoint
to the center of the site for each annotated feature. Heatmaps were generated
by quantifying the number of reads at each relative distance and length
coordinate.
"""
# parse object for managing input options
parser = optparse.OptionParser(description=desc)
# essential data, defines commandline options
parser.add_option('-b', dest='bedfile', default='', help="The bed file \
fragments should be compared to. Optional: only required for vplot.")
parser.add_option('--hist', dest='outhist', default='', help="The name of \
the histogram output file. Optional: Necessary only for making histogram.")
parser.add_option('--maxdist', dest="maxdistance", default=1000, help="Maximum \
distance away from feature of interest to be analyzed. Determines x-axis of \
vplot. Default: 1000 bp")
parser.add_option('--maxlen', dest="maxlength", default=500, help="Maximum \
sequenced fragment length analyzed. Default: 500 bp")
parser.add_option('-s', dest='samfile', default ='', help="The SAM file to \
be analyzed.")
parser.add_option('-t', dest='title', default='', help="The title for the \
figures. Optional.")
parser.add_option('-v', dest="outvplot", default='', help="The name of the \
vplot outplot file. Optional: only required if making a vplot.")
parser.add_option('--windist', dest='windowdist', default=25, help="Window \
size for x-axis (distance from feature) of vplot. Default: 25 bp")
parser.add_option('--winlen', dest='windowlen', default=5, help="Window \
size for y-axis (fragment size) of vplot. Default: 5 bp")
parser.add_option('-x', dest='xlabvplot', default='', help="The label for the \
x-axis on the vplot. For example, 'distance from hotspots'. Optional.")
# load the inputs
(options,args) = parser.parse_args()
# reads the inputs from commandline
bedfile = options.bedfile
maxdistance = options.maxdistance
maxlength = options.maxlength
outhist = options.outhist
outvplot = options.outvplot
samfile = options.samfile
title = options.title
windowdist = options.windowdist
windowlen = options.windowlen
xlabvplot = options.xlabvplot
if __name__ == "__main__":
sam_list = import_sam_pysam(samfile)
sam_dict = parse_pair_pysam(sam_list)
# sam_list = import_sam(samfile)
# sam_dict = parse_pair_sam(sam_list)
if (outlist is not "") | (outhist is not ""):
lengths = extract_lengths(sam_dict)
if outhist:
plot_hist(lengths, out_hist=outhist, title=title,
xlab="sequenced fragment length (bp)")
if outvplot:
if not bedfile:
print("No bed file indicated for vplot analysis.")
else:
midpoints = import_bed(bedfile)
nearby = find_nearby(sam_dict, midpoints, int(maxdistance))
(grouped, frag_cut_points, dist_cut_points) = group_data(nearby,
int(windowdist), int(windowlen), int(maxdistance),
int(maxlength))
vplot(grouped, frag_cut_points, dist_cut_points, outvplot,
xlabvplot, title)