-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path11_gene_to_GO.py
198 lines (134 loc) · 6.71 KB
/
11_gene_to_GO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#####################################################################################################################################################################
#
# Script used in:
#
# "Gene expression profiling across ontogenetic stages in wood white (Leptidea sinapis) reveals pathways linked to butterfly diapause regulation."
# Luis Leal, Venkat Talla, Thomas Källman, Magne Friberg, Christer Wiklund, Vlad Dincă, Roger Vila, Niclas Backström
# Mol. Eco. (2018)
#
#
#####################################################################################################################################################################
#
# Luis Leal
# Uppsala University, Uppsala, Sweden, 2017
# Written in Python 3
#
#####################################################################################################################################################################
#####################################################################################################################################################################
#
# Script used to create two-column file that lists each Trinity gene and its associated GO terms
#
# Input files: -> trinotate-invertebrates-CONSOLIDATED.csv (consolidated trinotate file produced by 09_tximport_consolidateGenes.py)
# -> text file containing list of Trinity genes
#
# Output files: -> file listing genes and their associated GO terms ('OUTPUT_trinity_to_GO.txt')
# -> file listing GO codes and their associated genes ('OUTPUT_GO_to_trinity.txt')
# -> file associating transcripts to gene names ('OUTPUT_trinity_to_gene.txt')
#
#####################################################################################################################################################################
print('\n Gleaning GO terms ...')
######################################################### USAGE
error_message1 = ' \n \
USAGE: $python3 gene_to_GO.py <annotation_file.csv> <gene_list.txt>'
# Example:
# python3 gene_to_GO.py trinotate-invertebrates-CONSOLIDATED.csv genelist_test1.txt
######################################################### LOAD STANDARD MODULES
import sys
import re # 're' module provides regular expression matching operations
import ast # required to convert reference tree from string to list
import os # required to create new folders
import time
import csv #required to open csv file
######################################################## OPEN INPUT FILES
GENE_NAME_DIC = dict()
GENE_ANNOT_DIC = dict()
try:
with open(sys.argv[1], newline='') as csvfile1: # open consolidated Trinotate file
inputfile_1 = csv.reader(csvfile1, delimiter='\t')
for row in inputfile_1:
trinity_aux = row[0]
gene_name_aux = row[4]
annot_aux = row[17]
GENE_NAME_DIC[trinity_aux] = gene_name_aux #saves entry to dictionary (removing quotes)
GENE_ANNOT_DIC[trinity_aux] = annot_aux #saves entry to dictionary (removing quotes)
except:
print('\n Error[1]: input file missing.')
print(error_message1)
exit()
try:
inputfile_2 = open(sys.argv[2], 'r') # open file containing list of genes of interest
except:
print('\n Error[2]: input file missing.')
print(error_message1)
exit()
######################################################## PROCESS DATA
### store entries from file 2 as a list
goi_list = list()
for line in inputfile_2:
line = line.rstrip()
goi_list.append(line)
### Get GO terms associated to genes in gene list
GO_list = list()
for gene in goi_list :
aux_1 = GENE_ANNOT_DIC.get(gene ,0)
GO_list.append(aux_1)
#print(gene, aux_1)
#print(GO_list)
### Get gene name associated to genes in gene list
geneName_list = list()
for gene in goi_list :
aux_1 = GENE_NAME_DIC.get(gene ,0)
geneName_list.append(aux_1)
#print(gene, aux_1)
### get genes associated to each GO terms (for genes of interest only)
GO_DIC = dict() #dictionary: Go term and associated genes
GO_ind = list() #list of all GO terms
counter_1 = -1
for line in GO_list:
#print(line)
aux_list1 = list()
counter_1 += 1
GO_aux = ""
if line != '.' : # if GO term != '.' (empty)
# print(line + '\n')
for m in re.finditer('GO:', line): # find all GO terms in description
aux_list1.append(m.start())
# print(aux_list1)
for n in range(len(aux_list1)) : # for all GO terms in description, get GO term (based on index)
# print(line[aux_list1[n]])
if n != (len(aux_list1)-1) :
GO_aux = line[aux_list1[n]:(aux_list1[(n+1)]-1)]
else :
GO_aux = line[aux_list1[n]:]
if GO_aux != "" : # if GO term is not empty
GO_ind.append(GO_aux)
if GO_aux in GO_DIC : # if GO term already in dictionary
entry_aux = GO_DIC.get(GO_aux ,0)
if goi_list[counter_1] not in entry_aux : # make sure gene has not been associated yet to GO term in dictionary
entry_GO_aux = entry_aux + "," + goi_list[counter_1]
GO_DIC[GO_aux] = entry_GO_aux
else :
GO_DIC[GO_aux] = goi_list[counter_1]
GO_ind = sorted(list(set(GO_ind))) #remove duplicates and sort list of GO terms alphabetically
#print dictionary
#for term in GO_ind:
# print(term, GO_DIC[term])
######################################################## SAVE RESULTS TO FILE
outFileName1 = 'OUTPUT_trinity_to_GO.txt'
outFileName2 = 'OUTPUT_GO_to_trinity.txt'
outFileName3 = 'OUTPUT_trinity_to_gene.txt'
outfile1 = open(outFileName1, 'w')
outfile2 = open(outFileName2, 'w')
outfile3 = open(outFileName3, 'w')
for n in range(len(goi_list)) :
aux_save = goi_list[n] + '\t' + GO_list[n] + '\n'
outfile1.write(aux_save)
for term in GO_ind :
aux_save = term[0:10] + '\t' + GO_DIC[term] + '\n'
outfile2.write(aux_save)
for n in range(len(goi_list)) :
aux_save = goi_list[n] + '\t' + geneName_list[n] + '\n'
outfile3.write(aux_save)
outfile1.close()
outfile2.close()
outfile3.close()