# module_LDA-topic-modeling.py
# Source: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
import pandas as pd
import numpy as np
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
import nltk

nltk.download('wordnet')  # WordNet data needed by the lemmatizer
np.random.seed(2018)      # fix the seed so LDA runs are reproducible
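# Depending on the NLTK version, the lemmatizer may also need the Open Multilingual
# Wordnet data; if lemmatization raises a LookupError for 'omw-1.4', uncomment:
# nltk.download('omw-1.4')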
#%% Load dataset
data = pd.read_csv(r"C:\Users\vinee\OneDrive - Massachusetts Institute of Technology\MIT\Fall 2020\6.867\Project\amzn_text_embs_per_art.csv",usecols = ["title","text"])
data['index'] = data.index
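#%% Optional guard (a sketch; the CSV itself is not included here)
# preprocess() below assumes every 'text' value is a string, so rows with missing
# text would raise; this assumes such rows can simply be dropped.
data = data.dropna(subset=['text']).reset_index(drop=True)
data['index'] = data.index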
#%% Pre-processing text
stemmer = SnowballStemmer("english")

def lemmatize_stemming(text):
    # Lemmatize as a verb, then stem the result
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    # Tokenize and lowercase, drop stopwords and tokens shorter than 4 characters,
    # then lemmatize and stem what remains
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
#%% Preview a sample article before and after preprocessing
doc_sample = data.iloc[0]['text']
print('original document: ')
print(doc_sample.split(' '))
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))
#%% Preprocess the text of all the articles
processed_articles = data['text'].map(preprocess)
processed_articles.head()
#%% Preprocess all titles
processed_titles = data['title'].map(preprocess)
processed_titles.head()
#%% TEXT
# Bag of words on the article text: build a dictionary and preview its first 10 entries
dictionary = gensim.corpora.Dictionary(processed_articles)
count = 0
for k, v in dictionary.items():
    print(k, v)
    count += 1
    if count >= 10:
        break
#%% Filter out tokens that appear in fewer than 15 documents or in more than half of
# the documents, then keep only the 100,000 most frequent of the remaining tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
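#%% Quick sanity check (not in the source article): how many tokens survived filtering
print('tokens kept in dictionary:', len(dictionary))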
#%% Gensim doc2bow
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_articles]
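#%% Preview the bag-of-words vector of the first article (not in the source script).
# Each entry is (token id, count); dictionary[token_id] maps the id back to its token.
for token_id, token_count in bow_corpus[0]:
    print('word {} ("{}") appears {} time(s)'.format(token_id, dictionary[token_id], token_count))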
#%% TF-IDF
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
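#%% Preview the TF-IDF weights of the first article (a quick check, not in the source script)
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break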
#%% Running LDA using BOW
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=4)
#%% Words occurring in each topic and their relative weights (bag-of-words model)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
#%% Running LDA using TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
#%% Words occurring in each topic and their relative weights (TF-IDF model)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
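#%% Optional comparison (a sketch, not in the source article): topic coherence for both
# models using gensim's CoherenceModel with the 'c_v' measure on the tokenized articles.
from gensim.models import CoherenceModel
for name, model in [('bow', lda_model), ('tfidf', lda_model_tfidf)]:
    cm = CoherenceModel(model=model, texts=list(processed_articles), dictionary=dictionary, coherence='c_v')
    print(name, 'coherence:', cm.get_coherence())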
#%% Performance evaluation: classify a sample document with the LDA bag-of-words model
train_index = 0
print(processed_articles[train_index])
for index, score in sorted(lda_model[bow_corpus[train_index]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))
#%% Performance evaluation by classifying the same sample document using the LDA TF-IDF model
for index, score in sorted(lda_model_tfidf[bow_corpus[train_index]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))
#%% Testing the bag-of-words model on an unseen document
unseen_document = 'Amazon sees record Cyber Monday sales'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))