-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOntologyTagger.py
143 lines (128 loc) · 4.86 KB
/
OntologyTagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import calendar
import re
import warnings

# Silence known gensim deprecation noise before nltk pulls it in.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings(action='ignore', category=FutureWarning, module='gensim')

import json
import pickle
import pprint
from collections import Counter

from nltk.tokenize import *
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

import OntologyTree
class OntologyTagger:
    """Tag an email with a category from an ontology tree.

    Keywords are extracted from the email's header/body (frequency-based,
    with stop-words, month names and previously seen PERSON/LOCATION
    entities filtered out), then matched against the tree's category names
    using a word-embedding model's similarity score.  The email's fileID is
    appended to the best-matching category's list in the tree.

    Attributes:
        email: parsed email object exposing ``header``, ``body`` and ``fileID``.
        model: embedding model exposing ``similarity(w1, w2)`` (e.g. gensim
               KeyedVectors) — raises KeyError for out-of-vocabulary words.
        tree:  nested dict ontology; leaf categories map key -> list of fileIDs.
        categ: scratch list of leaf-category names filled by traverseTree().
        dir:   working directory containing a ``pickled/`` folder of NER output.
    """

    def __init__(self, email, model, tree, dir):
        self.email = email
        self.model = model
        self.tree = tree
        self.categ = []
        self.dir = dir

    def findTopicHeader(self):
        """Return the text following a 'Topic:' marker in the email header,
        or None when the header has no such line."""
        topicPattern = re.compile(r'Topic[:]([\W\w].*)', flags=re.IGNORECASE)
        topicLine = re.findall(topicPattern, self.email.header)
        return topicLine[0] if topicLine else None

    def keyWordsInTopic(self):
        """Return likely topic keywords as (word, count) pairs.

        Prefers keywords that also appear in an explicit 'Topic:' header
        line; falls back to the most frequent words of header + body.
        """
        try:
            with open("nameList.pkl", 'rb') as f:
                names = pickle.load(f)
        except (OSError, pickle.UnpicklingError, EOFError):
            # First run (or corrupt cache): seed the stop-name list with
            # email boilerplate words and all month names.
            names = (['Type', 'Topic', 'Fwd', 'Lecture', 'Talk', "Series",
                      "Seminar", "Seminars", "Presentation", "The"]
                     + list(calendar.month_name)[1:])
            with open("nameList.pkl", 'wb') as f:
                pickle.dump(names, f)
        newNames = []
        topic = self.findTopicHeader()
        if topic:
            toReturn = self.keyWordBody(names, newNames, self.email.body, topic)
            if toReturn:
                return toReturn
        return self.keyWordBody(names, newNames,
                                self.email.header + self.email.body, None)

    def keyWordBody(self, names, newNames, data, topicSent):
        """Extract candidate topic keywords from ``data``.

        Args:
            names:     known stop-names (boilerplate words, months, entities).
            newNames:  accumulator for newly discovered PERSON/LOCATION names.
            data:      raw text to mine (body, or header + body).
            topicSent: optional 'Topic:' header line used to rank keywords.

        Returns:
            Up to 5 (word, count) pairs ranked by frequency; when no topic
            sentence is given, the 4 most common words overall.

        Side effects: rewrites nameList.pkl with any newly found names.
        """
        lemmatizer = WordNetLemmatizer()
        stopwordInTopic = stopwords.words('english')
        tokenisedData = [lemmatizer.lemmatize(token)
                         for token in word_tokenize(data)]
        tokenisedData = [token for token in tokenisedData
                         if token not in stopwordInTopic and token.isalpha()]
        # Fold in PERSON/LOCATION entities from the NER pass for this email
        # so proper nouns don't masquerade as topic keywords.  Missing or
        # unreadable pickle output is treated as "no entities".
        pickleFileName = self.dir + "/pickled/" + str(self.email.fileID) + ".pickle"
        try:
            with open(pickleFileName, 'rb') as f:
                classifiedData = pickle.load(f)
        except (OSError, pickle.UnpicklingError, EOFError):
            classifiedData = []
        for sentence in classifiedData:
            for entry in sentence:
                if entry[1] in ('PERSON', 'LOCATION') and entry[0] in tokenisedData:
                    newNames.append(entry[0])
        names = names + newNames
        # Strip every known name case-insensitively.  (Fix: the original
        # removed items from the lists while iterating them, which skips
        # elements and left some names unfiltered.)
        lowered = {n.lower() for n in names}
        tokenisedData = [t for t in tokenisedData if t.lower() not in lowered]
        if topicSent:
            tokenisedHeader = [t for t in word_tokenize(topicSent)
                               if t.lower() not in lowered]
        else:
            tokenisedHeader = None
        frequencies = Counter(tokenisedData)
        likelyTopics = []
        if tokenisedHeader:
            headerKeywords = [word for word in tokenisedHeader
                              if word in tokenisedData]
            for pair in frequencies.most_common():
                if pair[0] in headerKeywords and len(likelyTopics) < 5:
                    likelyTopics.append(pair)
        else:
            likelyTopics = frequencies.most_common(4)
        with open("nameList.pkl", 'wb') as f:
            pickle.dump(names, f)
        return likelyTopics

    def findOntTreeMatch(self, keywords):
        """File this email under the ontology category most similar to its
        keywords, and return the (mutated) tree.

        Args:
            keywords: iterable of (word, count) pairs from keyWordsInTopic().
        """
        maxSimilarity = 0
        maxTopic = 0  # stays 0 (harmless no-op insert) if nothing scores > 0
        self.categ = []
        self.traverseTree(self.tree)
        for keyword in keywords:
            for category in self.categ:
                for word in word_tokenize(category):
                    try:
                        s = self.model.similarity(keyword[0], word)
                    except KeyError:
                        # Word absent from the embedding vocabulary.
                        s = 0
                    if s > maxSimilarity:
                        maxSimilarity = s
                        maxTopic = word
        self.addFileToTreeCat(self.tree, maxTopic)
        self.categ = []
        return self.tree

    def traverseTree(self, tr):
        """Recursively collect into self.categ every key whose value is a
        list — i.e. every leaf category of the ontology tree."""
        for key, value in tr.items():
            if isinstance(value, list):
                self.categ.append(key)
            if isinstance(value, dict):
                self.traverseTree(value)

    def addFileToTreeCat(self, t, cat):
        """Append this email's fileID to every list node named ``cat``,
        searching the whole tree recursively."""
        fileID = self.email.fileID  # renamed from `id` (shadowed the builtin)
        for key, value in t.items():
            if key == cat and isinstance(value, list):
                value.append(fileID)
            if isinstance(value, dict):
                self.addFileToTreeCat(value, cat)

    def printTree(self, tree):
        """Pretty-print an ontology tree for debugging."""
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(tree)