s2t2
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 8 additions & 1 deletion b/‎README.md‎
Lines changed: 8 additions & 1 deletion
diff --git a/‎app/classification/logistic_regression.py‎
Lines changed: 2 additions & 2 deletions b/‎app/classification/logistic_regression.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎app/classification/pipeline.py‎
Lines changed: 7 additions & 4 deletions b/‎app/classification/pipeline.py‎
Lines changed: 7 additions & 4 deletions
diff --git a/‎app/classification/random_forest.py‎
Lines changed: 2 additions & 2 deletions b/‎app/classification/random_forest.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎app/classification/xgboost.py‎
Lines changed: 2 additions & 2 deletions b/‎app/classification/xgboost.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎app/word2vec_classification/README.md‎
Lines changed: 13 additions & 0 deletions b/‎app/word2vec_classification/README.md‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎app/word2vec_classification/dataset.py‎
Lines changed: 72 additions & 0 deletions b/‎app/word2vec_classification/dataset.py‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎app/word2vec_classification/job.py‎
Lines changed: 35 additions & 0 deletions b/‎app/word2vec_classification/job.py‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎app/word2vec_embeddings/README.md‎
Lines changed: 2 additions & 0 deletions b/‎app/word2vec_embeddings/README.md‎
Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,8 @@ results/word2vec_embeddings/*.model
 results/word2vec_embeddings/*.kv
 results/word2vec_embeddings/*.csv
 
+results/word2vec_classification/*/*/model.joblib
+results/word2vec_classification/*/*/*.csv
 
 #results/*/*/*.png
 #results/*/*/*.html
 
@@ -53,7 +53,7 @@ BUCKET_NAME="my-bucket"
 
 ## Usage
 
-### OpenAI Embeddings
+### OpenAI Service
 
 Fetch some example embeddings from OpenAI API:
 
@@ -74,11 +74,18 @@ python -m app.dataset
 
 Perform machine learning and other analyses on the data:
 
+OpenAI Embeddings:
+
   + [Dimensionality Reduction](app/reduction/README.md)
   + [Clustering](app/clustering/README.md)
   + [Classification](app/classification/README.md)
   + [Reduced Classification](app/reduced_classification/README.md)
 
+Word2Vec Embeddings:
+
+  + [Dimensionality Reduction](app/word2vec_embeddings/README.md)
+  + [Classification](app/word2vec_classification/README.md)
+
 
 ## Testing
 
 
@@ -10,8 +10,8 @@
 
 class LogisticRegressionPipeline(ClassificationPipeline):
 
-    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
-        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
+    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
+        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)
 
         self.model = LogisticRegression(random_state=99) #multi_class="auto"
         self.model_dirname = "logistic_regression"
 
@@ -33,7 +33,7 @@
 class ClassificationPipeline(ABC):
     """Supports binary and multiclass classification."""
 
-    def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_folds=K_FOLDS, results_dirpath=None):
+    def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_folds=K_FOLDS, results_dirpath=None, will_upload=True):
 
         self.ds = ds or Dataset()
         self.x_scale = x_scale
@@ -69,6 +69,8 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
         self.k_folds = k_folds
         self._results_dirpath = results_dirpath
 
+        self.will_upload = bool(will_upload)
+
         # values set after training:
         self.gs = None
         self.results = None
@@ -80,6 +82,7 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
         self.model_dirname = None
         self.param_grid = param_grid or {}
 
+
     @property
     def model_type(self):
         return self.model.__class__.__name__
@@ -97,12 +100,12 @@ def perform(self):
             self.plot_roc_curve()
 
         # upload to cloud storage :-D
-        self.storage = ModelStorage(local_dirpath=self.results_dirpath)
-        self.storage.save_and_upload_model(self.model)
+        if self.will_upload:
+            self.storage = ModelStorage(local_dirpath=self.results_dirpath)
+            self.storage.save_and_upload_model(self.model)
 
 
     def train_eval(self):
-
         self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, self.y, shuffle=True, test_size=0.2, random_state=99)
         print("X TRAIN:", self.x_train.shape)
         print("Y TRAIN:", self.y_train.shape)
 
@@ -7,8 +7,8 @@
 
 class RandomForestPipeline(ClassificationPipeline):
 
-    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
-        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
+    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
+        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)
 
         self.model = RandomForestClassifier(random_state=99)
         self.model_dirname = "random_forest"
 
@@ -21,8 +21,8 @@
 
 class XGBoostPipeline(ClassificationPipeline):
 
-    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None):
-        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath)
+    def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=None, will_upload=False):
+        super().__init__(ds=ds, y_col=y_col, param_grid=param_grid, results_dirpath=results_dirpath, will_upload=will_upload)
 
         # UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release.
         # To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
 
@@ -0,0 +1,13 @@
+## Word2Vec Classification
+
+Save word2vec embeddings dataset with original user labels:
+
+```sh
+python -m app.word2vec_classification.dataset
+```
+
+Perform classification using the word2vec embeddings dataset:
+
+```sh
+FIG_SAVE=true FIG_SHOW=false python -m app.word2vec_classification.job
+```
@@ -0,0 +1,72 @@
+
+import os
+from functools import cached_property
+from pandas import read_csv
+
+from app import DATA_DIRPATH
+from app.dataset import Dataset
+from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH
+
+
+WORD2VEC_EMBEDDINGS_CSV_FILEPATH = os.path.join(WORD2VEC_RESULTS_DIRPATH, "document_vectors.csv")  # embeddings file read when compiling the dataset
+
+WORD2VEC_DATASET_PATH = os.path.join(DATA_DIRPATH, "word2vec", "botometer_sample_word2vec_embeddings_20230825.csv.gz")  # compiled dataset is written to / loaded from here
+
+class Word2VecDataset():
+    """Word2vec document embeddings merged with the original user labels.
+
+    The merged table is cached as a CSV file so later runs can skip the merge.
+    """
+
+    def __init__(self, force_recompile=False):
+        """
+        Params:
+            force_recompile (bool): if true, rebuild the dataset from the
+                results files even when the cached CSV file already exists.
+        """
+        self.csv_filepath = WORD2VEC_DATASET_PATH
+        self.force_recompile = force_recompile
+
+    @cached_property
+    def df(self):
+        """Load the dataset from the cached CSV file, or compile and cache it."""
+        if os.path.isfile(self.csv_filepath) and not self.force_recompile:
+            print("LOADING EXISTING DATASET FROM FILE...")
+            return read_csv(self.csv_filepath)
+
+        print("COMPILING DATASET FROM RESULTS FILES...")
+        ds = Dataset()
+        labels_df = ds.labels
+        embeddings_df = read_csv(WORD2VEC_EMBEDDINGS_CSV_FILEPATH)
+        df = labels_df.merge(embeddings_df, left_on="user_id", right_on="user_id")
+
+        # write dataset (for faster loading later); to_csv does not create
+        # missing parent directories, so make sure the target one exists:
+        dirpath = os.path.dirname(self.csv_filepath)
+        if dirpath:
+            os.makedirs(dirpath, exist_ok=True)
+        df.to_csv(self.csv_filepath, index=False)
+        return df
+
+    @cached_property
+    def x(self):
+        """Return a copy of the feature (embedding) columns of the dataset."""
+        return self.df[self.feature_cols].copy()
+
+    @property
+    def feature_cols(self):
+        """Names of the embedding columns, i.e. the columns with numeric names.
+
+        The original note here described them as features 0 through 99.
+        """
+        return [colname for colname in self.df.columns if colname.isnumeric()]
+
+
+
+if __name__ == "__main__":
+
+    # Load (or compile) the dataset and preview its first rows.
+    ds = Word2VecDataset()
+    preview_df = ds.df.head()
+
+    print(preview_df)
@@ -0,0 +1,35 @@
+import os
+
+from app import RESULTS_DIRPATH
+from app.classification import Y_COLS, Y_COLS_BINARY, Y_COLS_MULTICLASS
+from app.classification.logistic_regression import LogisticRegressionPipeline
+from app.classification.random_forest import RandomForestPipeline
+from app.classification.xgboost import XGBoostPipeline
+
+from app.word2vec_embeddings.pipeline import WORD2VEC_RESULTS_DIRPATH
+from app.word2vec_classification.dataset import Word2VecDataset
+
+
+CLASSIFICATION_RESULTS_DIRPATH = os.path.join(RESULTS_DIRPATH, "word2vec_classification")  # parent of the per-label, per-model results directories
+
+
+if __name__ == "__main__":
+
+    # Run every model type against every label column, passing the same
+    # dataset instance to all of the pipelines.
+    ds = Word2VecDataset()
+    will_upload = True
+
+    # (results subdirectory, pipeline class) pairs, in execution order;
+    # the slowest can go last:
+    model_pipelines = [
+        ("logistic_regression", LogisticRegressionPipeline),
+        ("xgboost", XGBoostPipeline),
+        ("random_forest", RandomForestPipeline),
+    ]
+
+    for y_col in Y_COLS:
+        for model_dirname, pipeline_class in model_pipelines:
+            results_dirpath = os.path.join(CLASSIFICATION_RESULTS_DIRPATH, y_col, model_dirname)
+            pipeline = pipeline_class(ds=ds, y_col=y_col, results_dirpath=results_dirpath, will_upload=will_upload)
+            pipeline.perform()
@@ -11,6 +11,8 @@ python -m app.word2vec_embeddings.pipeline
 # FIG_SAVE=true FIG_SHOW=true python -m app.word2vec_embeddings.pipeline
 ```
 
+### Dimensionality Reduction
+
 Perform dimensionality reduction on the resulting word and document embeddings, respectively:
 
 ```sh