-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport.py
60 lines (45 loc) · 1.71 KB
/
import.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from pymongo import MongoClient
from providers.novinkyjsonarticleprovider import NovinkyJsonArticleProvider
from nlp.nlp import NLP
from nlp.sentencesplitter import SentenceSplitter
from nlp.tokenizer import SentenceTokenizer
from nlp.stopword import StopwordsRemover
import datetime
import time
# Import script: pulls articles from the Novinky JSON provider, reduces each
# article body to a list of lemmatised terms and stores one document per
# article in the MongoDB collection ``topicmodeling.novinky``.

# NOTE(review): the connection string embeds database credentials in source
# control. They should be loaded from the environment or a secrets store, and
# the exposed password should be rotated.
client = MongoClient("mongodb+srv://lda-nlp:[email protected]/myFirstDatabase?retryWrites=true&w=majority")
db = client['topicmodeling']
collection = db['novinky']

# Text-processing helpers, created once and reused for every article.
nlp = NLP()
tokenizer = SentenceTokenizer()
splitter = SentenceSplitter()
stopwords = StopwordsRemover()

# NOTE(review): the meaning of the numeric argument is defined by the
# provider class (presumably the article id to start from) - confirm.
provider = NovinkyJsonArticleProvider(40348699)

for article in provider.get_next_article():
    article_id = article.get_id()

    # Articles already present in the collection are skipped, so the script
    # can be re-run without producing duplicate-key errors.
    if collection.find_one({"_id": article_id}):
        continue

    # Collect the article's terms sentence by sentence.
    doc = []
    for sentence in splitter.split(article.get_content()):
        tokens = tokenizer.tokenize(sentence)
        tokens = stopwords.remove_stopwords(tokens)
        for token in tokens:
            tok = token.strip()
            # Drop empty and very short tokens (fewer than 3 characters)
            # before paying for lemmatisation.
            if len(tok) < 3:
                continue
            lemma = nlp.lemmatize_nv(tok)
            # The stopword test is repeated on the lemma; only lemmas longer
            # than 3 characters are kept (note: stricter than the token
            # threshold above).
            if not stopwords.is_stopword(lemma) and len(lemma) > 3:
                doc.append(lemma)

    post = {
        "_id": article_id,  # article ID
        "title": article.get_name(),  # article headline
        "summary": article.get_summary(),  # article lead paragraph, used as the "summary" in listings
        "url": article.get_url(),  # article URL
        "terms": doc,  # list of terms
        # Record creation time. A timezone-aware UTC value replaces the
        # deprecated naive utcnow(); the stored instant is the same.
        "date": datetime.datetime.now(datetime.timezone.utc),
    }
    collection.insert_one(post)

    # Pause for one second after each stored article before requesting the
    # next one from the provider.
    time.sleep(1.00)