Commit e52e04e

Word2Vec Embeddings (#23)
Word Embeddings and User Embeddings
1 parent a164ea8 commit e52e04e

110 files changed (+128,272 −9 lines)

Large commits have some content hidden by default; only a subset of the 110 changed files is shown below.

.gitignore

Lines changed: 5 additions & 0 deletions
```diff
@@ -15,6 +15,11 @@ results/*/*.csv.gz
 results/*/*.json
 results/*/*.json
 
+results/word2vec_embeddings/*.model
+results/word2vec_embeddings/*.kv
+results/word2vec_embeddings/*.csv
+
+
 #results/*/*/*.png
 #results/*/*/*.html
 results/*/*/*.json
```

app/reduction/pipeline.py

Lines changed: 14 additions & 5 deletions
```diff
@@ -90,8 +90,9 @@ def perform(self):
 
         self.embeddings = self.reducer.fit_transform(self.x)
         print("EMBEDDINGS:", self.embeddings.shape)
-        self.embeddings_df = DataFrame(self.embeddings, columns=self.component_names)
-        self.embeddings_df = self.embeddings_df.merge(self.labels_df, left_index=True, right_index=True)
+        self.embeddings_df = DataFrame(self.embeddings, columns=self.component_names, index=self.x.index)
+        if isinstance(self.labels_df, DataFrame):
+            self.embeddings_df = self.embeddings_df.merge(self.labels_df, left_index=True, right_index=True)
 
         # EXPLAINABILITY:
         if self.reducer_type == "PCA":
@@ -101,7 +102,7 @@ def perform(self):
             self.loadings = self.reducer.components_.T * np.sqrt(self.reducer.explained_variance_)
             print("LOADINGS...", self.loadings.shape)
             self.loadings_df = DataFrame(self.loadings, columns=self.component_names)
-            self.loadings_df.index = self.reducer.feature_names_in_
+            self.loadings_df.index = self.x.columns.tolist() # self.reducer.feature_names_in_
 
             # these represent the absolute magnitude of importances, not direction up or down
             self.feature_importances = {}
@@ -127,24 +128,32 @@ def save_embeddings(self):
         results_df.to_csv(csv_filepath, index=False)
 
 
-    def plot_embeddings(self, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, subtitle=None, color=None, color_map=None, category_orders=None, hover_data=None, results_dirpath=None):
+    def plot_embeddings(self, height=500, fig_show=FIG_SHOW, fig_save=FIG_SAVE, results_dirpath=None,
+                        subtitle=None, text=None, size=None, hover_data=None,
+                        color=None, color_map=None, color_scale=None, category_orders=None):
         title = f"Dimensionality Reduction Results ({self.reducer_type} n_components={self.n_components})"
         if subtitle:
             title += f"<br><sup>{subtitle}</sup>"
 
         chart_params = dict(x="component_1", y="component_2",
                             title=title, height=height,
                             #color=color, #"artist_name",
-                            #hover_data=hover_data #["audio_filename", "track_number"]
+                            hover_data=hover_data #{"index": (self.embeddings_df.index)} #["audio_filename", "track_number"]
         )
         if color:
             chart_params["color"] = color
         if color_map:
             chart_params["color_discrete_map"] = color_map
+        if color_scale:
+            chart_params["color_continuous_scale"] = color_scale
         if category_orders:
             chart_params["category_orders"] = category_orders
         if hover_data:
             chart_params["hover_data"] = hover_data
+        if size:
+            chart_params["size"] = size
+        if text:
+            chart_params["text"] = text
 
         if self.n_components == 2:
             fig = px.scatter(self.embeddings_df, **chart_params)
```
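The new keyword arguments (`size`, `text`, `color_scale`) are straight pass-throughs to plotly express. Below is a minimal standalone sketch (not from the commit) of the call the method ends up making; `component_1`/`component_2` follow the pipeline's column naming, while `score` and `user_id` are hypothetical columns invented for illustration:

```python
# Standalone illustration of how the new plot_embeddings parameters
# map onto plotly express. All data and column names are made up.
import plotly.express as px
from pandas import DataFrame

embeddings_df = DataFrame({
    "component_1": [0.1, -0.4, 0.7],
    "component_2": [1.2, 0.3, -0.5],
    "score": [10, 25, 40],        # hypothetical continuous column
    "user_id": ["a", "b", "c"],   # hypothetical identifier column
})

fig = px.scatter(
    embeddings_df, x="component_1", y="component_2",
    title="Dimensionality Reduction Results (PCA n_components=2)", height=500,
    color="score",
    color_continuous_scale="Viridis",  # what the new color_scale kwarg forwards to
    size="score",                      # what the new size kwarg forwards to
    hover_data=["user_id"],
)
fig.show()
```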

app/word2vec_embeddings/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
## Word2Vec
2+
3+
4+
### Text Embeddings
5+
6+
```sh
7+
python -m app.word2vec_embeddings.pipeline
8+
9+
# WORD2VEC_DESTRUCTIVE=true python -m app.word2vec_embeddings.pipeline
10+
11+
# FIG_SAVE=true FIG_SHOW=true python -m app.word2vec_embeddings.pipeline
12+
```
13+
14+
Perform dimensionality reduction on the resulting word and document embeddings, respectively:
15+
16+
```sh
17+
python -m app.word2vec_embeddings.reduction
18+
19+
# FIG_SAVE=true FIG_SHOW=true python -m app.word2vec_embeddings.reduction
20+
```
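The pipeline (shown below) exports `word_vectors.csv` and `document_vectors.csv` under the word2vec results directory. Here is a minimal sketch of reading those exports back for downstream analysis; the relative paths are an assumption based on the pipeline's save locations, and the column names match what the pipeline writes:

```python
# Minimal sketch of loading the exported embeddings.
# Assumption: paths mirror WORD2VEC_RESULTS_DIRPATH in the pipeline below.
import pandas as pd

words_df = pd.read_csv("results/word2vec_embeddings/word_vectors.csv", index_col="token")
docs_df = pd.read_csv("results/word2vec_embeddings/document_vectors.csv", index_col="user_id")

# drop the bookkeeping columns to recover the raw word-vector matrix
word_matrix = words_df.drop(columns=["word_count", "is_stopword"]).values
print(word_matrix.shape)  # (vocab_size, 100) with the default vector_size=100
print(docs_df.shape)      # (n_users, 100): one mean-pooled vector per user
```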
app/word2vec_embeddings/pipeline.py

Lines changed: 192 additions & 0 deletions
```python
import os
import shutil
from functools import cached_property
from pprint import pprint

#from datetime import datetime
from itertools import chain
from collections import Counter

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess as tokenizer
from pandas import DataFrame, Series
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as SKLEARN_STOPWORDS

from app import RESULTS_DIRPATH
from app.classification import Y_COLS

WORD2VEC_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "word2vec_embeddings")
#WORD2VEC_DESTRUCTIVE = bool(os.getenv("WORD2VEC_DESTRUCTIVE", default="false") == 'true')

#VECTOR_LENGTH = 100


class WordPipe:
    def __init__(self, corpus, tokenizer=tokenizer, results_dirpath=WORD2VEC_RESULTS_DIRPATH, stopwords=SKLEARN_STOPWORDS): # destructive=WORD2VEC_DESTRUCTIVE
        """Param corpus: a pandas Series of raw document texts (one string per document)."""

        self.corpus = corpus
        self.tokenizer = tokenizer
        self.stopwords = stopwords

        #self.destructive = bool(destructive)
        self.results_dirpath = results_dirpath
        self.model_filepath = os.path.join(self.results_dirpath, "w2v.model")
        #self.kv_filepath = os.path.join(self.results_dirpath, "w2v.kv")
        self.word_vectors_csv_filepath = os.path.join(self.results_dirpath, "word_vectors.csv")
        self.document_vectors_csv_filepath = os.path.join(self.results_dirpath, "document_vectors.csv")

    @cached_property
    def corpus_tokens(self):
        return self.corpus.apply(self.tokenizer)

    @cached_property
    def word_counts(self):
        all_words = list(chain.from_iterable(self.corpus_tokens)) # h/t chat gpt for this one
        word_counter = Counter(all_words)
        return Series(word_counter.values(), index=word_counter.keys(), name="word_count")

    def perform(self):
        # TOKEN ANALYSIS (SIDE QUEST)
        print(self.word_counts.sort_values(ascending=False).head())

        self.load_or_train_model()
        print("WORDS:", len(self.words))

        print("WORD VECTORS:", self.word_vectors_df.shape) # 100 columns, default vector_size=100
        self.save_word_vectors()

        print("DOCUMENT VECTORS:", self.document_vectors.shape)
        self.save_document_vectors()

    def load_or_train_model(self, vector_size=100, window=10, min_count=2, workers=4):
        #if self.destructive:
        #    print("----------------")
        #    print("DESTRUCTIVE MODE...")
        #    #shutil.rmtree(self.results_dirpath)
        #    os.removedirs()

        os.makedirs(self.results_dirpath, exist_ok=True)

        if os.path.exists(self.model_filepath):
            print("----------------")
            print("LOADING MODEL FROM FILE...")
            print(self.model_filepath)
            self.model = Word2Vec.load(self.model_filepath)
            print(self.model)
            #print(type(self.model))
        else:
            print("----------------")
            print("INITIALIZING NEW MODEL...")
            self.model = Word2Vec(window=window, min_count=min_count, workers=workers, vector_size=vector_size)
            print(self.model)

            print("----------------")
            print("VOCAB...")
            self.model.build_vocab(self.corpus_tokens) # progress_per=1000
            #print("N SAMPLES:", model.corpus_count)
            #print("EPOCHS:", model.epochs)

            print("----------------")
            print("TRAINING...")
            self.model.train(self.corpus_tokens, total_examples=self.model.corpus_count, epochs=self.model.epochs)
            print(round(self.model.total_train_time, 0), "seconds")

            print("----------------")
            print("SAVING...")
            self.model.save(self.model_filepath)
            #self.model.wv.save(self.kv_filepath)

        return self.model

    # AVAILABLE AFTER TRAINING:

    # WORD ANALYSIS

    @property
    def words(self):
        return self.model.wv.index_to_key

    @property
    def word_vectors(self):
        return self.model.wv.vectors

    @property
    def word_vectors_df(self):
        return DataFrame(self.word_vectors, index=self.words)

    @cached_property
    def words_df(self):
        words_df = self.word_vectors_df.merge(self.word_counts, how="inner", left_index=True, right_index=True)
        words_df["is_stopword"] = words_df.index.map(lambda token: token in self.stopwords)
        words_df.index.name = "token"
        return words_df

    def save_word_vectors(self):
        self.words_df.to_csv(self.word_vectors_csv_filepath, index=True)

    # DOCUMENT ANALYSIS

    def infer_document_vector(self, tokens):
        """Gets the average vector for a document's tokens."""
        # keep only tokens that are in the model's vocabulary
        tokens = [token for token in tokens if token in self.model.wv.key_to_index]
        if any(tokens):
            # calculate the average vector for the tokens in the document
            doc_vector = np.mean([self.model.wv[token] for token in tokens], axis=0)
        else:
            # if none of the tokens are in the model's vocabulary, return a zero vector
            doc_vector = np.zeros(self.model.vector_size)
        return doc_vector

    @cached_property
    def document_vectors(self):
        return self.corpus_tokens.apply(self.infer_document_vector)

    @cached_property
    def document_vectors_df(self):
        # UNPACK EMBEDDINGS TO THEIR OWN COLUMNS
        docs_df = DataFrame(self.document_vectors.values.tolist())
        docs_df.columns = [str(i) for i in range(0, len(docs_df.columns))]
        docs_df.index = self.corpus_tokens.index
        docs_df.index.name = "user_id"
        return docs_df

    def save_document_vectors(self):
        self.document_vectors_df.to_csv(self.document_vectors_csv_filepath, index=True)


if __name__ == "__main__":

    from app.dataset import Dataset

    ds = Dataset()
    df = ds.df

    #df["tokens"] = df["tweet_texts"].apply(tokenizer)
    #print(df["tokens"].head())

    wp = WordPipe(corpus=df["tweet_texts"])
    wp.perform()

    # INVESTIGATION
    # https://radimrehurek.com/gensim/models/keyedvectors.html
    wv = wp.model.wv #> gensim.models.keyedvectors.KeyedVectors
    print(len(wv)) #> 34,729 ORIGINAL ( ______ STOPWORD-REMOVED)

    #breakpoint()
    trumplike = wv.most_similar("realdonaldtrump", topn=10)
    pprint(trumplike)

    #wv.similarity(w1="impeachment", w2="sham")
    #wv.similarity(w1="impeachment", w2="just")
    #wv.similarity(w1="impeachment", w2="fair")
    #wv.similarity(w1="impeachment", w2="unfair")
    #wv.similarity(w1="realdonaldtrump", w2="guilty")
    #wv.similarity(w1="realdonaldtrump", w2="innocent")
```
