Skip to content

Commit f530f60

Browse files
committed
Word2Vec Classification (#24)
Word2Vec Embeddings for User Classification
1 parent e52e04e commit f530f60

File tree

140 files changed

+5314
-12
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

140 files changed

+5314
-12
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ results/word2vec_embeddings/*.model
1919
results/word2vec_embeddings/*.kv
2020
results/word2vec_embeddings/*.csv
2121

22+
results/word2vec_classification/*/*/model.joblib
23+
results/word2vec_classification/*/*/*.csv
2224

2325
#results/*/*/*.png
2426
#results/*/*/*.html

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ BUCKET_NAME="my-bucket"
5353

5454
## Usage
5555

56-
### OpenAI Embeddings
56+
### OpenAI Service
5757

5858
Fetch some example embeddings from OpenAI API:
5959

@@ -74,11 +74,18 @@ python -m app.dataset
7474

7575
Perform machine learning and other analyses on the data:
7676

77+
OpenAI Embeddings:
78+
7779
+ [Dimensionality Reduction](app/reduction/README.md)
7880
+ [Clustering](app/clustering/README.md)
7981
+ [Classification](app/classification/README.md)
8082
+ [Reduced Classification](app/reduced_classification/README.md)
8183

84+
Word2Vec Embeddings:
85+
86+
+ [Dimensionality Reduction](app/word2vec_embeddings/README.md)
87+
+ [Classification](app/word2vec_classification/README.md)
88+
8289

8390
## Testing
8491

app/classification/logistic_regression.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010

1111
class LogisticRegressionPipeline(ClassificationPipeline):
1212

13-
def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
14-
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
13+
def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
14+
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)
1515

1616
self.model = LogisticRegression(random_state=99) #multi_class="auto"
1717
self.model_dirname = "logistic_regression"

app/classification/pipeline.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
class ClassificationPipeline(ABC):
3434
"""Supports binary and multiclass classification."""
3535

36-
def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_folds=K_FOLDS, results_dirpath=None):
36+
def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_folds=K_FOLDS, results_dirpath=None, will_upload=True):
3737

3838
self.ds = ds or Dataset()
3939
self.x_scale = x_scale
@@ -69,6 +69,8 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
6969
self.k_folds = k_folds
7070
self._results_dirpath = results_dirpath
7171

72+
self.will_upload = bool(will_upload)
73+
7274
# values set after training:
7375
self.gs = None
7476
self.results = None
@@ -80,6 +82,7 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
8082
self.model_dirname = None
8183
self.param_grid = param_grid or {}
8284

85+
8386
@property
8487
def model_type(self):
8588
return self.model.__class__.__name__
@@ -97,12 +100,12 @@ def perform(self):
97100
self.plot_roc_curve()
98101

99102
# upload to cloud storage :-D
100-
self.storage = ModelStorage(local_dirpath=self.results_dirpath)
101-
self.storage.save_and_upload_model(self.model)
103+
if self.will_upload:
104+
self.storage = ModelStorage(local_dirpath=self.results_dirpath)
105+
self.storage.save_and_upload_model(self.model)
102106

103107

104108
def train_eval(self):
105-
106109
self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, self.y, shuffle=True, test_size=0.2, random_state=99)
107110
print("X TRAIN:", self.x_train.shape)
108111
print("Y TRAIN:", self.y_train.shape)

app/classification/random_forest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77

88
class RandomForestPipeline(ClassificationPipeline):
99

10-
def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
11-
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
10+
def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
11+
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)
1212

1313
self.model = RandomForestClassifier(random_state=99)
1414
self.model_dirname = "random_forest"

app/classification/xgboost.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121

2222
class XGBoostPipeline(ClassificationPipeline):
2323

24-
def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
25-
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
24+
def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
25+
super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)
2626

2727
# UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release.
2828
# To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
## Word2Vec Classification
2+
3+
Save word2vec embeddings dataset with original user labels:
4+
5+
```sh
6+
python -m app.word2vec_classification.dataset
7+
```
8+
9+
Perform classification using the word2vec embeddings dataset:
10+
11+
```sh
12+
FIG_SAVE=true FIG_SHOW=false python -m app.word2vec_classification.job
13+
```
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
2+
import os
3+
from functools import cached_property
4+
from pandas import read_csv
5+
6+
from app import DATA_DIRPATH
7+
from app.dataset import Dataset
8+
from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH
9+
10+
11+
WORD2VEC_EMBEDDINGS_CSV_FILEPATH = os.path.join(WORD2VEC_RESULTS_DIRPATH, "document_vectors.csv")
12+
13+
WORD2VEC_DATASET_PATH = os.path.join(DATA_DIRPATH, "word2vec", "botometer_sample_word2vec_embeddings_20230825.csv.gz")
14+
15+
class Word2VecDataset():
    """Word2vec document embeddings joined with the original user labels.

    The merged dataset is cached as a CSV file at ``self.csv_filepath`` so
    later runs can load it directly instead of re-compiling it.

    NOTE: this class does not inherit from ``Dataset``; it only exposes the
    ``df``, ``x`` and ``feature_cols`` attributes defined below.
    """

    def __init__(self, force_recompile=False):
        """
        Params:
            force_recompile (bool): when true, rebuild the dataset from the
                labels and embeddings even if a cached CSV file already exists.
        """
        self.csv_filepath = WORD2VEC_DATASET_PATH
        self.force_recompile = force_recompile

    @cached_property
    def df(self):
        """Return the merged labels + embeddings dataframe.

        Loads the cached CSV file when it exists (and recompilation was not
        requested). Otherwise merges the user labels with the word2vec
        document vectors on "user_id", writes the result to the cache file,
        and returns it.
        """
        if os.path.isfile(self.csv_filepath) and not self.force_recompile:
            print("LOADING EXISTING DATASET FROM FILE...")
            return read_csv(self.csv_filepath)

        print("COMPILING DATASET FROM RESULTS FILES...")
        labels_df = Dataset().labels
        embeddings_df = read_csv(WORD2VEC_EMBEDDINGS_CSV_FILEPATH)
        df = labels_df.merge(embeddings_df, on="user_id")

        # write dataset (for faster loading later), creating the target
        # directory first so a fresh checkout doesn't fail on the write:
        os.makedirs(os.path.dirname(self.csv_filepath), exist_ok=True)
        df.to_csv(self.csv_filepath, index=False)
        return df

    @cached_property
    def x(self):
        """Return a copy of the feature columns (see ``feature_cols``)."""
        return self.df[self.feature_cols].copy()

    @property
    def feature_cols(self):
        """Names of the embedding columns, i.e. the columns whose name is numeric."""
        return [colname for colname in self.df.columns if colname.isnumeric()]
58+
59+
60+
#@property
61+
#def label_cols(self):
62+
# return [colname for colname in self.df.columns if not colname.isnumeric()]
63+
64+
65+
66+
if __name__ == "__main__":

    # load (or compile) the dataset, then show its first few rows as a check:
    dataset = Word2VecDataset()
    preview = dataset.df.head()
    print(preview)

app/word2vec_classification/job.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
import os
2+
3+
from app import RESULTS_DIRPATH
4+
from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS
5+
from app.classification.logistic_regression import LogisticRegressionPipeline
6+
from app.classification.random_forest import RandomForestPipeline
7+
from app.classification.xgboost import XGBoostPipeline
8+
9+
from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH
10+
from app.word2vec_classification.dataset import Word2VecDataset
11+
12+
13+
CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "word2vec_classification")
14+
15+
16+
if __name__ == "__main__":

    ds = Word2VecDataset()

    will_upload = True

    # (results subdirectory, pipeline class) pairs, in training order;
    # random forest is the slowest, so it goes last:
    pipeline_classes = [
        ("logistic_regression", LogisticRegressionPipeline),
        ("xgboost", XGBoostPipeline),
        ("random_forest", RandomForestPipeline),
    ]

    # train every model type against every target column, each with its
    # own results directory:
    for y_col in Y_COLS:
        for model_dirname, pipeline_class in pipeline_classes:
            results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, model_dirname)
            pipeline = pipeline_class(ds=ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload)
            pipeline.perform()

app/word2vec_embeddings/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ python -m app.word2vec_embeddings.pipeline
1111
# FIG_SAVE=true FIG_SHOW=true python -m app.word2vec_embeddings.pipeline
1212
```
1313

14+
### Dimensionality Reduction
15+
1416
Perform dimensionality reduction on the resulting word and document embeddings, respectively:
1517

1618
```sh

0 commit comments

Comments
 (0)