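# Topic modelling over articles supplied by NovinkyJsonArticleProvider:
# each article is split into sentences, tokenized, stopword-filtered and
# lemmatized, then a gensim LDA model is trained on the lemmatized documents
# and the inferred topic mixture is printed per article URL.
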
from providers.novinkyjsonarticleprovider import NovinkyJsonArticleProvider
from nlp.nlp import NLP
from nlp.sentencesplitter import SentenceSplitter
from nlp.tokenizer import SentenceTokenizer
from wordcloud import WordCloud
from nlp.stopword import StopwordsRemover
from gensim import corpora
from gensim.models import LdaModel
import time
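
# providers.* and nlp.* appear to be project-local helper modules; gensim and
# wordcloud are the third-party dependencies (WordCloud is only referenced by
# the commented-out block at the end of the file).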
nlp = NLP()
tokenizer = SentenceTokenizer()
splitter = SentenceSplitter()
stopwords = StopwordsRemover()
provider = NovinkyJsonArticleProvider()

# Debug pass: print everything the provider yields. Note that quit() stops the
# script here, so the topic-modelling code below only runs once it is removed.
for article in provider.get_next_article():
    print(article)
quit()

# Collect one lemmatized document per article for LDA training.
i = 0
docs = []
articleurls = []
for article in provider.get_next_article():
    time.sleep(0.07)
    print(article)
    print(article.get_content())
    content = article.get_content()
    sentences = splitter.split(content)
    doc = []
    tf = {}  # per-document term frequencies (not used further in this script)
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        tokens = stopwords.remove_stopwords(tokens)
        for token in tokens:
            tok = token.strip()
            if tok == '' or len(tok) < 3:
                continue
            lemma = nlp.lemmatize_nv(tok)
            if not stopwords.is_stopword(lemma):
                if lemma not in tf:
                    tf[lemma] = 0
                tf[lemma] += 1
                # Keep only lemmas longer than three characters as LDA tokens.
                if lemma != '' and len(lemma) > 3:
                    doc.append(lemma)
    docs.append(doc)
    articleurls.append(article.get_url())
    i += 1
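
# Build the gensim dictionary and bag-of-words corpus from the lemmatized documents.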
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(text) for text in docs]
num_topics = 10
chunksize = 2000
passes = 100
iterations = 1000
eval_every = None
temp = dictionary[0] # This is only to "load" the dictionary.
id2word = dictionary.id2token
print("Corpus size: " + str(len(docs)))
model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)
print(model.print_topics())

for i in range(len(articleurls)):
    print("----")
    print(articleurls[i])
    topics = model.get_document_topics(corpus[i])
    for topic in topics:
        print("Topic:" + str(topic[0]))
        print(model.print_topic(topic[0]))
        print(topic[1])
    print("----")

#wordcloud = WordCloud(
#    background_color="white",
#    max_words=5000,
#    contour_width=3,
#    contour_color='steelblue',
#    width=1600,
#    height=800
#)
#
#wordcloud.generate_from_text(complete)
#wordcloud.to_file('wordcloud.png')
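# Note: `complete` is never defined in this script; if the word-cloud block is
# re-enabled, one assumed input could be the joined lemmas collected above, e.g.
#complete = " ".join(" ".join(d) for d in docs)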