-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathextract_CDSs_from_GenBank.awk
executable file
·509 lines (430 loc) · 20.6 KB
/
extract_CDSs_from_GenBank.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
#!/usr/bin/awk -f
#---------------------------------------------------------------------------------------------------------
#: PROGRAM: extract_CDSs_from_GenBank.awk; first commmit Dec 23, 2020
#: AUTHOR: Pablo Vinuesa, @pvinmex, https://www.ccg.unam.mx/~vinuesa/
#: SOURCE: https://github.com/vinuesa/intro2linux
#: USAGE: extract_CDSs_from_genbank.awk genbank_file.gbk
#: AIMS:
#: 1. extracts the CDSs from a single GenBank file and writes them to genbank_basename_CDSs.fna
#: 2. translates the CDSs and writes them to file genbank_basename_proteome.faa
#: 3. writes the GenBank file in tabular format as genbank_basename.tsv
#: 4. extracts the complete DNA string(s) from the input GenBank saving them to genbank_basename.fsa
#: NOTES:
#: 1. the program currently does not deal with CDSs containing introns.
#: CDSs containing the join statement: complement(join(4497616..4498557,4498557..4498814)), are skipped
#: Use only for bacterial|mitochondrial|plastid|yeast genomes
#=========================================================================================================
BEGIN {
# Initializations
DEBUG = 0 # set to 1 if debugging messages should be activated (prints to "/dev/stderr")
# print the FASTA sequences stored in hashes in ascending order by locus_tag number
PROCINFO["sorted_in"] = "@ind_num_asc"
progname="extract_CDSs_from_GenBank.awk"
VERSION=0.8 # v0.8 Jan 1, 2021; prints the CDS and product lengths to the table
# v0.7 Dec 29, 2020;
# * captures also the gene_name, adding it to the table after locus_tag and
# appending it to product [gene_name] as such or as [NONAME] if not available
# v0.6 Dec 29, 2020;
# * now also captures the protein_id to label prots in *faa files; it is added to tsv outfile
# * Only CDSs are labeled with locus_tag
# * Manages input gbks like E. coli MT559985 that do not have locus_tag attributes,
# labeling them with locus_id"_CDS"geneID
# v0.5 Dec 28, 2020; slightly more streamlined (awkish) code, by using more awk defaults (avoid $0 ~ /regexp/, etc)
# prints LOCUS_length to *tsv file using a nested AoA as the 1ary dna AoA key: dna[CDSs_AoA[k]["LOCUS"]]["len"]
# v0.4 Dec 26, 2020; prints also the CDS's translation products to file
# v0.3 Dec 25, 2020; correctly appends the [pseudogene] label to the end of the product line
# v0.2 Dec 24, 2020; now captures full product name, even when split over two lines
# v0.1 Dec 23, 2020; first commit
# check user input: needs the GenBank file to process
# as single argument on the command line
if(ARGC < 2)
print_help(progname, VERSION)
# capture the gbk basename for naming output files written by this script
input_gbk = ARGV[1]
basename = input_gbk
gsub(/\..*$/, "", basename)
gbk_tsv = basename".tsv"
gbk_faa = basename"_proteome.faa"
gbk_fsa = basename".fsa"
gbk_CDSs = basename"_CDSs.fna"
# hash used by rev_compl()
compl_nucl["T"]="A"
compl_nucl["A"]="T"
compl_nucl["C"]="G"
compl_nucl["G"]="C"
compl_nucl["N"]="N"
# initialize the "codons" hash holding the codon-aminoacid pairs,
# based on the universal genetic code, to translate CDSs with translate_dna()
codons["ATA"]="I"; codons["ATC"]="I"; codons["ATT"]="I"; codons["ATG"]="M";
codons["ACA"]="T"; codons["ACC"]="T"; codons["ACG"]="T"; codons["ACT"]="T";
codons["AAC"]="N"; codons["AAT"]="N"; codons["AAA"]="K"; codons["AAG"]="K";
codons["AGC"]="S"; codons["AGT"]="S"; codons["AGA"]="R"; codons["AGG"]="R";
codons["CTA"]="L"; codons["CTC"]="L"; codons["CTG"]="L"; codons["CTT"]="L";
codons["CCA"]="P"; codons["CCC"]="P"; codons["CCG"]="P"; codons["CCT"]="P";
codons["CAC"]="H"; codons["CAT"]="H"; codons["CAA"]="Q"; codons["CAG"]="Q";
codons["CGA"]="R"; codons["CGC"]="R"; codons["CGG"]="R"; codons["CGT"]="R";
codons["GTA"]="V"; codons["GTC"]="V"; codons["GTG"]="V"; codons["GTT"]="V";
codons["GCA"]="A"; codons["GCC"]="A"; codons["GCG"]="A"; codons["GCT"]="A";
codons["GAC"]="D"; codons["GAT"]="D"; codons["GAA"]="E"; codons["GAG"]="E";
codons["GGA"]="G"; codons["GGC"]="G"; codons["GGG"]="G"; codons["GGT"]="G";
codons["TCA"]="S"; codons["TCC"]="S"; codons["TCG"]="S"; codons["TCT"]="S";
codons["TTC"]="F"; codons["TTT"]="F"; codons["TTA"]="L"; codons["TTG"]="L";
codons["TAC"]="Y"; codons["TAT"]="Y"; codons["TAA"]="*"; codons["TAG"]="*";
codons["TGC"]="C"; codons["TGT"]="C"; codons["TGA"]="*"; codons["TGG"]="W";
unknown = "X"
}
#-------------------------------- END OF BEGIN BLOCK ----------------------------------#
# The following block will parse the relevant annotation features of CDSs,
# saving them in the the CDSs_AoA (Array of Arrays)
# Capture the locus ID on the first line to add to the FASTA header
# Then skip all lines, from the first record (line) to the source FEATURE,
# Note the use of an expr-REGEXP range: expr, /regexp/
FNR == 1 || /^LOCUS/ {
locus_id = $2
printf(">>> Processing LOCUS %s of %s ...\n", locus_id, FILENAME) > "/dev/stderr"
}
# skip lines until the source attribute of the FEATURES block is reached
/^LOCUS/, /^\s{4,}source\s{4,}/ { next }
{
# Exclude pseudogenes, indicated by truncated gene coordinates with [<>],
# or by the join statement: complement(join(4497616..4498557,4498557..4498814))
if (/^\s+CDS\s{4,}[[:digit:]]+/ && flag == 0 && !/[<>,]/ && !/complem/ && !/join/)
{
flag = 1
# note the pattern capturing parentheses used in this regexp,
# which capture the coords 123..789 into the c_arr array
match($0, /^\s{4,}CDS\s{4,}([[:digit:]]+)\.\.([[:digit:]]+)/, c_arr)
start = c_arr[1]
end = c_arr[2]
complflag = 0
}
if (/^\s+CDS\s{4,}complement/ && ! flag && ! /[<>,]/ && !/join/)
{
flag = 1
coord = $0
# for the sake of showing different string-manipulation functions
# in this case we combine sub(), gsub() and split()
# to capture the CDS's coords
sub(/^\s{4,}CDS\s{4,}complement\(/, "", coord)
gsub(/[\)]/, "", coord)
split(coord, c_arr, /\.\./)
start = c_arr[1]
end = c_arr[2]
complflag = 1
}
if( flag && /^\s+\/locus_tag=/ )
{
match($0, /^\s{8,}\/locus_tag="([[:alnum:]_]+)"/, lt_arr)
locus_tag = lt_arr[1]
}
if( flag && /^\s+\/gene=/ )
{
match($0, /^\s{8,}\/gene="(\S+)"/, g_arr)
gene_name = g_arr[1]
}
# mark pseudogenes
if( flag && /^\s{8,}\/pseudo/ ) pseudoflag = 1
if(/^\s{8,}\/product=/)
{
line_no = FNR
product = $0
sub(/^\s{8,}\/product=/, "", product)
gsub(/["]/, "", product)
gsub(/[>]/, "=", product) # # remove '>' from (adenine(2058)-N(6))-dimethyltransferase > Erm(B)"
# we use geneID to generate a numeric index for the hash, so that it prints out in order!
geneID++
# All the CDS's-relevant information parsed from the annotations
# will be stored in the CDSs_AoA (Array of Arrays)
if (locus_tag) CDSs_AoA[geneID]["l"] = locus_tag
# some GenBank files like Escherichia_coli MT559985 do not have a locus_tag!
if (!locus_tag) CDSs_AoA[geneID]["l"] = locus_id"_CDS"geneID
if(gene_name) CDSs_AoA[geneID]["gene"] = gene_name
if(!gene_name) CDSs_AoA[geneID]["gene"] = "NONAME"
CDSs_AoA[geneID]["LOCUS"] = locus_id
CDSs_AoA[geneID]["s"] = start
CDSs_AoA[geneID]["e"] = end
CDSs_AoA[geneID]["c"] = complflag
CDSs_AoA[geneID]["pseudo"] = pseudoflag
complflag=0
gene_name = ""
}
# if the product description is split in two lines,
# capture it and append it to the first line already saved in product
if(FNR == line_no+1)
{
# this regexp is critical to ensure the caputre of
# all possible product names, like ANT(3'')-Ia, or
# (adenine(2058)-N(6))-dimethyltransferase > Erm(B)", that must end with "
# Set DEBUG = 1 at the beginning of the BEGIN{} block
# to print all the second lines for producut names to "/dev/stderr"
#if($0 ~/^\s+[[:alnum:] ]+\"$/)
if( /^\s+[a-zA-Z0-9\(\)\'\-\[\],;:> ]+"$/ )
{
prod_2cnd_line = $0
gsub(/^\s+/, "", prod_2cnd_line)
gsub(/["]/, "", prod_2cnd_line)
gsub(/[>]/, "=", prod_2cnd_line) # (adenine(2058)-N(6))-dimethyltransferase > Erm(B)"
product = product " " prod_2cnd_line
if(DEBUG)
print "FNR=" FNR, "line_no="line_no, "locus_tag="locus_tag,
"prod_2cnd_line="prod_2cnd_line > "/dev/stderr"
if (pseudoflag) product = product " [pseudogene]"
CDSs_AoA[geneID]["p"] = product #" ["CDSs_AoA[geneID]["gene"]"]"
line_no=pseudoflag=0
}
else
{
if (pseudoflag) product = product " [pseudogene]"
CDSs_AoA[geneID]["p"] = product #" ["CDSs_AoA[geneID]["gene"]"]"
line_no=pseudoflag=0
}
}
if( /^\s+\/protein_id="/ )
{
match($0, /^\s{8,}\/protein_id="(\S+)"$/, pi_arr)
CDSs_AoA[geneID]["pid"] = pi_arr[1]
if (DEBUG) print "protein_id="pi_arr[1]
flag = 0
}
# After the ORIGING mark, starts the dna string
if (flag == 0 && !/^ORIGIN/)
next
# Remove spaces and digits preceding sequence strings
# remove also the single spaces between 10 bp sequence strings
if (/^ORIGIN/) { flag = 1; seq = ""; next }
if(flag && /^\s+[[:digit:]]+\s+[AGCTNagctn\s]+/)
{
sub(/^\s+[[:digit:]]+\s+/, "")
gsub(/[[:space:]]/, "")
seq = seq $0
}
# fill the dna AoA, once we have reached the end of record mark '//'
if(flag && /^\/\/$/)
{
# save the genbank's DNA string and lenght in the dna array of arrays,
# indexed by [locus_id]["seq"] and [locus_id]["len"], respectively
seq = toupper(seq)
dna[locus_id]["seq"] = seq
dna[locus_id]["len"] = length(seq)
flag = 0
locus_id=""
}
}
# ------------------------------- END OF PATTERN-ACTION BLOCK -----------------------------------#
END {
# conditional required to avoid errors printed to STDERR when the help menu is printed
if( ARGC > 1 )
{
# Need to remove pre-existing "basename".tsv" file, as the script appends lines to it
# (print stuff >> gbk_tsv)
# Can use any of the following options:
# 1. call system()
# 2. pipe command to sh
# the second may by more portable and is used as a function call
#
#if(system("[ -s " gbk_tsv " ]") == 0) { system("rm "gbk_tsv) }
#OR:
#rm_if_exists="[ -s " gbk_tsv " ] && rm "gbk_tsv
#print rm_if_exists | "/bin/sh"
#close("/bin/sh")
rm_if_exists(gbk_tsv)
# print GBK table header for gbk_tsv
print "CDS_no\tLOCUS\tLOCUS_length\tlocus_tag\tgene_name\tprotein_id\tproduct_description\tstart\tend\tCDS_l\tprod_l\tcomplement\tpseudogene" > gbk_tsv
# if exists, rm the FASTA file holding CDSS, since the script appends to it
# (print stuff >> gbk_CDSs)
rm_if_exists(gbk_CDSs)
# loop over the CDSs_AoA array of arrays to print
# i) the GBK file in tabular format to file gbk_tsv
# ii) and the CDSs to STDOUT
for (k in CDSs_AoA)
{
# 1. print the GBK file in tabular format to file gbk_tsv; Nothe the use of an AoA as 1ary key to the dna AoA
printf "%d\t%s\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\n", k, CDSs_AoA[k]["LOCUS"], dna[CDSs_AoA[k]["LOCUS"]]["len"],
CDSs_AoA[k]["l"],CDSs_AoA[k]["gene"], CDSs_AoA[k]["pid"], CDSs_AoA[k]["p"], CDSs_AoA[k]["s"], CDSs_AoA[k]["e"],
CDSs_AoA[k]["e"] - CDSs_AoA[k]["s"] + 1 , (CDSs_AoA[k]["e"] - CDSs_AoA[k]["s"] +1 )/3,
CDSs_AoA[k]["c"], CDSs_AoA[k]["pseudo"] >> gbk_tsv
# 2. extract_sequence_by_coordinates, write CDSs.fna, translate CDSs and write proteome.faa; add gene name to product
extract_sequence_by_coordinates(CDSs_AoA[k]["l"], CDSs_AoA[k]["pid"], CDSs_AoA[k]["p"] " ["CDSs_AoA[k]["gene"]"]",
seq, CDSs_AoA[k]["s"], CDSs_AoA[k]["e"], CDSs_AoA[k]["c"], gbk_CDSs, CDSs_AoA[k]["pseudo"])
}
# report if the files were successfully written to disk
print_if_exists(gbk_tsv)
print_if_exists(gbk_CDSs)
# if exists, rm the FASTA file holding the DNA string, since the script appends to it
# (print stuff >> gbk_fsa)
rm_if_exists(gbk_fsa)
# print the complete DNA sequence strings for each LOCUS/record
# in the input GenBank to file gbk_fsa
for (h in dna)
{
printf ">%s\n%s\n", h, dna[h]["seq"] >> gbk_fsa
}
# report if the file was successfully written to disk
print_if_exists(gbk_fsa)
# translate fastas and print to disk
rm_if_exists(gbk_faa)
for (h in prots) {
# make sure we print only non-redundant sequences
hcount[h]++
# skip printing any pesudogene translations, which will be empty
if( h ~ / \[pseudogene\]$/ ) continue
printf(">%s\n%s\n", h, prots[h]) >> gbk_faa
}
# report if the file was successfully written to disk
print_if_exists(gbk_faa)
}
}
# ------------------------------------------- END BLOCK FINISHES ---------------------------------------- #
#=========================================================================================================#
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTION DEFINITIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #
#---------------------------------------------------------------------------------------------------------#
function rm_if_exists(file, cmd)
{
cmd="[ -s " file " ] && rm "file
#print cmd | "/bin/sh" # <-- symbolic link: bin/sh -> dash
#close("/bin/sh")
print cmd | "/usr/bin/env bash"
close("/usr/bin/env bash")
}
#---------------------------------------------------------------------------------------------------------
function print_if_exists(file, cmd)
{
cmd="[ -s " file " ] && echo '# wrote file '"file
print cmd | "/usr/bin/env bash"
close("/usr/bin/env bash")
}
#---------------------------------------------------------------------------------------------------------
function rev_string(seq, i,s)
{
for(i=length(seq); i != 0; i--)
{
s = s substr($0, i, 1)
}
return s
}
#---------------------------------------------------------------------------------------------------------
function rev_compl(lt, pi, pro, dna_seq, outfile, i, k, s, dna_header, prot_header) # reverse complement
{ # receives two arguments: the fasta header and the CDS sequence to be translated
i=k=s=""
dna_header = lt " " pro
prot_header = pi " " pro
dna_seq = toupper(dna_seq) # to match the keys of compl_nucl
for( i = length(dna_seq); i !=0; i-- ) { # note that the loop reads the sequence from end to beginnig
k = substr(dna_seq, i, 1)
s = s compl_nucl[k]
}
printf(">%s\n%s\n", dna_header, s) >> outfile
translate_dna(prot_header, s)
}
#---------------------------------------------------------------------------------------------------------
function extract_sequence_by_coordinates(ltg, pid, prod, dna_seq, start, end, revCompFlag, outfile, seq, dna_header, prot_header)
{
# This function extracts the CDS string using the caputred coordinates
# and appends the sequence to the output FASTA file, calling the auxiliary
# function rev_compl() if required.
# Both extract_sequence_by_coordinates() and rev_compl() call translate_dna()
# to translate the CDSs using the codons hash, saving the whole proteome in
# the prots hash
dna_seq = toupper(dna_seq)
dna_header = ltg " " prod
prot_header = pid " " prod
if(start == 1 && revCompFlag)
{
seq = substr(dna_seq, start, end)
rev_compl(ltg, pid, prod, seq, outfile)
}
if(start == 1 && ! revCompFlag)
{
print ">"dna_header"\n"substr(dna_seq, start, end) >> outfile
translate_dna(prot_header, substr(dna_seq, start, end))
}
if( start > 1 && ! revCompFlag )
{
print ">"dna_header"\n"substr(dna_seq, start, end-start+1) >> outfile
translate_dna(prot_header, substr(dna_seq, start, end-start+1))
}
if( start > 1 && revCompFlag )
{
seq = substr(dna_seq, start, end-start+1)
rev_compl(ltg, pid, prod, seq, outfile)
}
}
#---------------------------------------------------------------------------------------------------------
function translate_dna(header, dna_seq, s, i, p)
{ # receives two arguments: the fasta header and the CDS sequence to be translated
# Initialize variables:
# do-while loop control variable i (nt counter)
# and p, which will hold the translation product
{i=1; p=""; triplet_counter=0}
# Here we run a do-while loop; the do loop is a variation of the while looping statement.
# The do loop always executes the body once and then repeats the body as long as the condition is true
# We use the do-while loop, to get a first triplet string saved in s;
# then the while loop keeps going until substr() got the last triplet, resulting in an empty s="".
do {
# First check that the script got some input
# if not, exit with an error message
if(length(dna_seq) == 0) {
print "ERROR: need a DNA sequence string to translate (valid DNA sequence, divisible by 3)" > "/dev/stderr"
exit 1
# Check that the DNA sequence string is divisible by 3 using the modulo operator
# if not, exit with an error message
} else if(length(dna_seq)%3) {
printf("# WARNING: input DNA sequence for %s not divisible by 3. Will skip it!\n", header) > "/dev/stderr"
continue
}
# use substr() to split the input sequence (dna_seq) into triplets saved in s
s = substr(dna_seq, i, 3)
# make sure that input nt symbols are uppercase to match the hash keys
s=toupper(s)
# keep track of processed triplets (codons)
triplet_counter++
# check that the input corresponds to valid nucleotides
if ( s ~ /[^acgtACGT]+/ ) {
print "WARNING: input triplet", triplet_counter, "=", s,
"contains a non-valid nucleotide symbol ..." > "/dev/stderr"
break
}
# use the codon hash c as lookup table to translate the s triplet
# appending codons[s] to the growing peptide p
{
# break out of loop if we get no more triplets
# out of the input DNA string with substr()
if (codons[s]=="") {
break
}
else if (s in codons == 0) {
# if the triplet is not contained in c, append "X" to p
p = p unknown
}
else {
# append aminoacid codons[s] to growing peptide
p = p codons[s]
}
}
i=i+3 # increment the counter of processed dna nucleotides by 3
}
# run while loop until substring cannot retrieve any more triplets
while (s!="")
prots[header] = p
}
#---------------------------------------------------------------------------------------------------------
function print_help(prog, vers)
{
print "# AIMS:" > "/dev/stderr"
print "# 1. extracts the CDSs from a single GenBank file, saving them to genbank_basename_CDSs.fna" > "/dev/stderr"
print "# 2. translates the CDSs and writes them to file genbank_basename_proteome.faa" > "/dev/stderr"
print "# 3. writes the GenBank file in tabular format as genbank_basename.tsv" > "/dev/stderr"
print "# 4. extracts the complete DNA string(s) from the input GenBank, saving them as genbank_basename.fsa" > "/dev/stderr"
print "\n# NOTES:" > "/dev/stderr"
print "# 1.", prog, "v"vers, "does not deal with CDSs containing introns" > "/dev/stderr"
print "# CDSs containing the join statement: complement(join(4497616..4498557,4498557..4498814)) are skipped" > "/dev/stderr"
print "# Use only for bacterial|mitochondrial|plastid|yeast genomes" > "/dev/stderr"
print "\n# TODO:" > "/dev/stderr"
print "# 1. add code to process split (pseudo)genes as in complement(join(4497616..4498557,4498557..4498814))" > "/dev/stderr"
print "\n# USAGE of", prog, "v"vers":" > "/dev/stderr"
printf "\t%s %s\n", prog, "genbank_file.gbk" > "/dev/stderr"
print "\tOR" > "/dev/stderr"
printf "\tfor f in *.gbk; do %s %s\n\n", prog, "$f; done" > "/dev/stderr"
exit 0
}
#---------------------------------------------------------------------------------------------------------