-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
78 lines (58 loc) · 1.99 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# This "hack" is necessary for the c_v algorithm to work right: all real work
# runs only under the __main__ guard at the bottom of the script (presumably
# because gensim evaluates c_v in worker processes that may re-import this
# module -- TODO confirm).
from typing import Callable, Iterable, List


def remove_stopwords(
    term_lists: Iterable[Iterable[str]],
    is_stopword: Callable[[str], bool],
) -> List[List[str]]:
    """Return one filtered term list per input document.

    Args:
        term_lists: An iterable yielding the terms of each document.
        is_stopword: Predicate that returns True for terms to drop.

    Returns:
        A list with one entry per input document, holding the terms for which
        ``is_stopword`` returned a falsy value, in their original order.
        Documents that end up empty are kept as empty lists.
    """
    return [
        [term for term in terms if not is_stopword(term)]
        for terms in term_lists
    ]


def main() -> None:
    """Train an LDA model on the 'novinky' collection and print its coherence.

    Reads every document's "terms" from MongoDB, removes stopwords, builds and
    saves a gensim dictionary, trains and saves an LdaModel, then prints the
    'u_mass' and 'c_v' coherence of the trained model.

    Raises:
        SystemExit: If no terms are left after stopword removal.
    """
    # Imports stay local so that merely importing this module pulls in neither
    # the heavy third-party packages nor the project-local stopword module.
    import os

    from gensim import corpora
    from gensim.models import LdaModel
    from gensim.models.coherencemodel import CoherenceModel
    from pymongo import MongoClient

    from nlp.stopword import StopwordsRemover

    # NOTE(review): the fallback URI embeds credentials in source control.
    # Prefer supplying MONGODB_URI through the environment and rotate the
    # committed password.
    mongo_uri = os.environ.get(
        "MONGODB_URI",
        "mongodb+srv://lda-nlp:[email protected]/myFirstDatabase?retryWrites=true&w=majority",
    )
    stopwords = StopwordsRemover()

    # The context manager closes the client when the block exits; the cursor
    # is fully consumed inside it because remove_stopwords builds a list.
    with MongoClient(mongo_uri) as client:
        cursor = client['topicmodeling']['novinky'].find({})
        docs = remove_stopwords(
            (document["terms"] for document in cursor),
            stopwords.is_stopword,
        )

    dictionary = corpora.Dictionary(docs)
    if len(dictionary) == 0:
        # Without this guard the dictionary[0] lookup below fails with a bare
        # KeyError, after an empty dictionary has already been written to disk.
        raise SystemExit(
            "No terms left after stopword removal; nothing to train on."
        )

    # Both save() calls below write into this directory; create it up front so
    # a fresh checkout does not fail on a missing path.
    os.makedirs('./models/model4', exist_ok=True)
    dictionary.save('./models/model4/dictionary.id2word')
    corpus = [dictionary.doc2bow(text) for text in docs]

    chunksize = 2000
    passes = 1
    iterations = 50
    eval_every = None

    # Indexing the dictionary once is only there to "load" it: the id -> token
    # mapping read on the next line is filled in lazily by that first lookup.
    dictionary[0]
    id2word = dictionary.id2token

    print("Corpus size: " + str(len(docs)))

    for num_topics in [14]:
        model = LdaModel(
            corpus=corpus,
            id2word=id2word,
            chunksize=chunksize,
            alpha='auto',
            eta='auto',
            iterations=iterations,
            num_topics=num_topics,
            passes=passes,
            eval_every=eval_every,
            random_state=31052021,  # fixed seed for reproducible runs
        )
        # NOTE: every topic count writes to the same path, so with more than
        # one entry in the list above only the last model would survive.
        model.save("models/model4/model")
        print(model.print_topics())

        for measure in ['u_mass', 'c_v']:
            cm = CoherenceModel(
                model=model,
                corpus=corpus,
                coherence=measure,
                texts=docs,
                dictionary=dictionary,
            )
            coherence = cm.get_coherence()
            print("---")
            print(num_topics)
            print(measure)
            print(coherence)
            print("---")


if __name__ == '__main__':
    main()