-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: classify_text.py
57 lines (45 loc) · 1.54 KB
/
classify_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from pymongo import MongoClient
from gensim import corpora
from gensim.models import LdaModel
from gensim.test.utils import datapath
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from nlp.nlp import NLP
from nlp.sentencesplitter import SentenceSplitter
from nlp.tokenizer import SentenceTokenizer
from wordcloud import WordCloud
from nlp.stopword import StopwordsRemover
import fileinput
from os import listdir
from os.path import isfile, join
# Pre-trained LDA model and its token dictionary.
# NOTE(review): model and dictionary come from different directories
# (model3 vs model4) — presumably intentional, but verify they match.
lda = LdaModel.load("./models/model3/model")
dictionary = Dictionary.load('./models/model4/dictionary.id2word')
# Text-processing pipeline components from the project-local nlp package.
tokenizer = SentenceTokenizer()
splitter = SentenceSplitter()
stopwords = StopwordsRemover()
nlp = NLP()
text = ""  # placeholder; reassigned for each file in the loop below
directory = "./test/"  # folder containing the documents to classify
def classify_document(text):
    """Return the LDA topic distribution for a raw text document.

    The document is split into sentences, tokenized, stripped of
    stopwords, lemmatized, and filtered for short tokens before being
    converted to a bag of words and scored against the pre-loaded
    module-level ``lda`` model.

    Args:
        text: Raw document text as a single string.

    Returns:
        The gensim LDA topic distribution (list of (topic_id, prob)
        pairs) for the document.
    """
    doc = []  # accumulated lemmas forming the bag of words
    sentences = splitter.split(text)
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        tokens = stopwords.remove_stopwords(tokens)
        for token in tokens:
            tok = token.strip()
            # Skip short tokens; len < 3 also covers the empty string,
            # so the original separate '' check was redundant.
            if len(tok) < 3:
                continue
            lemma = nlp.lemmatize_nv(tok)
            # len > 3 also excludes '', so no separate emptiness check.
            # NOTE(review): raw tokens of length 3 pass the filter above
            # but lemmas must be length > 3 — confirm this asymmetry is
            # intended rather than an off-by-one.
            if not stopwords.is_stopword(lemma) and len(lemma) > 3:
                doc.append(lemma)
    bow = dictionary.doc2bow(doc)
    return lda[bow]
# Classify every regular file in the target directory and print its
# filename followed by its topic distribution.
files = [f for f in listdir(directory) if isfile(join(directory, f))]
for f in files:
    # Build the path with join() — consistent with the isfile() check
    # above, and robust if `directory` ever loses its trailing slash
    # (the original used raw string concatenation: directory + f).
    with open(join(directory, f), 'r', encoding="utf-8") as doc:
        text = doc.read()
    print(f)
    topics = classify_document(text)
    print(topics)