s2t2
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 3 deletions b/‎.gitignore‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎README.md‎
Lines changed: 25 additions & 6 deletions b/‎README.md‎
Lines changed: 25 additions & 6 deletions
diff --git a/‎app/classification/logistic_regression.py‎
Lines changed: 14 additions & 10 deletions b/‎app/classification/logistic_regression.py‎
Lines changed: 14 additions & 10 deletions
diff --git a/‎app/classification/pipeline.py‎
Lines changed: 12 additions & 8 deletions b/‎app/classification/pipeline.py‎
Lines changed: 12 additions & 8 deletions
diff --git a/‎app/classification/random_forest.py‎
Lines changed: 13 additions & 13 deletions b/‎app/classification/random_forest.py‎
Lines changed: 13 additions & 13 deletions
diff --git a/‎app/classification/xgboost.py‎
Lines changed: 7 additions & 3 deletions b/‎app/classification/xgboost.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎app/model_storage.py‎
Lines changed: 115 additions & 0 deletions b/‎app/model_storage.py‎
Lines changed: 115 additions & 0 deletions
diff --git a/‎requirements.txt‎
Lines changed: 5 additions & 1 deletion b/‎requirements.txt‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎results/classification/fourway_label/logistic_regression/confusion.html‎
Lines changed: 1 addition & 1 deletion b/‎results/classification/fourway_label/logistic_regression/confusion.html‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎results/classification/fourway_label/logistic_regression/confusion.png‎
1 Byte b/‎results/classification/fourway_label/logistic_regression/confusion.png‎
1 Byte
@@ -12,14 +12,13 @@ data/*/*.csv.gz
 results/*/*.csv
 results/*/*.csv.gz
 results/*/*.json
+results/*/*.json
 
 #results/*/*/*.png
 #results/*/*/*.html
 results/*/*/*.json
 
-#results/*/*/*/*.png
-#results/*/*/*/*.html
-#results/*/*/*/*.json
+results/classification/*/*/model.joblib
 
 # ignore these files b/c they contain user ids:
 results/classification/*/*/predictions.csv
 
@@ -1,16 +1,15 @@
 # openai-embeddings-2023
 
-Dimensionality Reduction on Twitter Data using OpenAI Embeddings
+OpenAI Text Embeddings for User Classification in Social Networks
 
-## Research Questions
-
-Can we use ChatGPT's embeddings to reproduce our previous research?
-
-Can ChatGPT discern bot status, political sentiment, and q-anon support, based on user profiles and tweets?
+  + [Results Website](https://s2t2.github.io/openai-embeddings-2023/index.html)
+  + [Conference Talk (INFORMS 2023)](https://www.youtube.com/watch?v=AmF-5D4p1_4)
 
 
 ## Setup
 
+### Virtual Environment
+
 Create and/or activate virtual environment:
 
 ```sh
@@ -24,8 +23,28 @@ Install package dependencies:
 pip install -r requirements.txt
 ```
 
+### Users Sample
+
 Obtain a copy of the "botometer_sample_openai_tweet_embeddings_20230724.csv.gz" CSV file, and store it in the "data/text-embedding-ada-002" directory in this repo. This file was generated by the [notebooks](/notebooks/README.md), and is ignored from version control because it contains user identifiers.
 
+### Cloud Storage
+
+Trained models are saved to Google Cloud Storage. Create a project on Google Cloud, and enable the Cloud Storage API if it is not already enabled. Then create a service account, download its JSON credentials file, and store it in the root directory of this repo as "google-credentials.json". This file has been ignored from version control.
+
+From the Cloud Storage console, create a new bucket, and note its name (used below as `BUCKET_NAME`).
+
+### Environment Variables
+
+Create a local ".env" file and add contents like the following:
+
+```sh
+# this is the ".env" file...
+
+GOOGLE_APPLICATION_CREDENTIALS="/path/to/openai-embeddings-2023/google-credentials.json"
+BUCKET_NAME="my-bucket"
+
+```
+
 ## Usage
 
 ### Dataset Loading
 
@@ -22,18 +22,22 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
 
             # C (float), default=1.0
             # Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
-            #"classifier__C": [0.5, 1, 2, 10, 100],
+            "classifier__C": [
+                #0.1,
+                #0.5,
+                1, 2, 5,
+                10, 25, 50,
+                100
+            ],
 
             # default max_iter is 100
-            "classifier__max_iter": [#5, #15,
-                                    #20,
-                                    25, #30, #35,
-                                    #50,
-                                    #100, #250,
-                                    #500,
-                                    1_000,
-                                    10_000
-                            ],
+            "classifier__max_iter": [10, 25,
+                                     50,
+                                     100,
+                                     250,
+                                     500, 1_000, 5_000, 10_000
+                                     ],
+
             #"classifier__solver": ["liblinear", "newton-cg", "lbfgs", "sag", "saga"],
         }
 
 
@@ -6,27 +6,28 @@
 
 import numpy as np
 from pandas import Series, DataFrame
-from sklearn.model_selection import GridSearchCV, train_test_split
-from sklearn.pipeline import Pipeline
-#import matplotlib.pyplot as plt
 import plotly.express as px
 import plotly.graph_objs as go
 
+from sklearn.preprocessing import LabelBinarizer, LabelEncoder
+from sklearn.model_selection import GridSearchCV, train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.metrics import roc_curve, auc
+
+from app.colors import ORANGES
 from app.dataset import Dataset
+from app.model_storage import ModelStorage
 from app.classification import CLASSIFICATION_RESULTS_DIRPATH, save_results_json, class_labels
 from app.classification.results import ClassificationResults
 
+
 K_FOLDS = int(os.getenv("K_FOLDS", default="5"))
 #X_SCALE = bool(os.getenv("X_SCALE", default="false").lower() == "true")
 #SCALER_TYPE = os.getenv("SCALER_TYPE")
 
 FIG_SHOW = bool(os.getenv("FIG_SHOW", default="false").lower() == "true")
 FIG_SAVE = bool(os.getenv("FIG_SAVE", default="true").lower() == "true")
 
-from sklearn.metrics import roc_curve, roc_auc_score, auc
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import label_binarize, LabelBinarizer, LabelEncoder
-from app.colors import ORANGES
 
 
 class ClassificationPipeline(ABC):
@@ -72,6 +73,7 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
         self.gs = None
         self.results = None
         self.results_json = {}
+        self.storage = None
 
         # set in child class:
         self.model = None
@@ -94,7 +96,9 @@ def perform(self):
         else:
             self.plot_roc_curve()
 
-        #self.save_and_upload_model()
+        # upload to cloud storage :-D
+        self.storage = ModelStorage(local_dirpath=self.results_dirpath)
+        self.storage.save_and_upload_model(self.model)
 
 
     def train_eval(self):
 
@@ -16,12 +16,14 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
         self.param_grid = param_grid or {
 
             # default=100
-            "classifier__n_estimators": [50, 100, 150, 250],
+            "classifier__n_estimators": [50,
+                                         100, 150, 250, 500
+                                         ],
 
             # criterion {"gini", "entropy", "log_loss"}, default="gini"
             # ... The function to measure the quality of a split.
             # ... "gini" for Gini impurity, "log_loss" / "entropy" for Shannon information gain
-            "classifier__criterion": ["gini", "log_loss"],
+            #"classifier__criterion": ["gini", "log_loss"],
 
             # min_samples_split (int or float), default=2
             #The minimum number (or percentage) of samples required to split an internal node
@@ -31,12 +33,12 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
             #... The minimum number of samples required to be at a leaf node.
             # ... A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches.
             # ... This may have the effect of smoothing the model, especially in regression.
-            "classifier__min_samples_leaf": (1,  5,
-                                             10, 25,
-                                             50, #75,
-                                             #90,
-                                             100, #110, #125, 150
-                                             ),
+            #"classifier__min_samples_leaf": (1,  5,
+            #                                 10, 25,
+            #                                 50, #75,
+            #                                 #90,
+            #                                 100, #110, #125, 150
+            #                                 ),
 
             # max_depth (int), default=None
             # ... The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
@@ -78,9 +80,7 @@ def coefs(self):
 
     for y_col in Y_COLS:
 
-        pipeline = RandomForestPipeline(ds=ds, y_col=y_col, param_grid={
-            "classifier__criterion": ["gini"],
-            "classifier__min_samples_leaf": [3, 5, 8],
-            "classifier__n_estimators": [250, 500, 1000],
-        })
+        pipeline = RandomForestPipeline(ds=ds, y_col=y_col)
         pipeline.perform()
+
+        #breakpoint()
@@ -35,10 +35,10 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
         self.param_grid = param_grid or {
 
             # n_estimators (Optional[int]) – Number of boosting rounds.
-
+            'classifier__n_estimators': [ #50,
+                                         100, 150, 250, 500],
 
             # max_depth (Optional[int]) – Maximum tree depth for base learners.
-            #"max_depth": [2, 4, 8, 16],
 
             # max_leaves – Maximum number of leaves; 0 indicates no limit.
             #"max_leaves": [0, 2, 4, 8, 16]
@@ -48,6 +48,8 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
             # grow_policy – Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow depth-wise. 1: favor splitting at nodes with highest loss change.
             #
             # learning_rate (Optional[float]) – Boosting learning rate (xgb’s “eta”)
+            #'classifier__learning_rate': [0.1, 0.2, 0.3, 0.5],
+
             #
             # verbosity (Optional[int]) – The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
             #
@@ -133,7 +135,7 @@ def explainability_json(self):
 
     @property
     def coefs(self):
-        """random forest has .feature_importances_ instead of .coef_ """
+        """xgboost has .feature_importances_ instead of .coef_ """
         return Series(self.model.feature_importances_, index=self.model.feature_names_in_) #.sort_values(ascending=False) # don't sort? preserve order with features?
 
 
@@ -153,3 +155,5 @@ def coefs(self):
 
         pipeline = XGBoostPipeline(ds=ds, y_col=y_col)
         pipeline.perform()
+
+        #breakpoint()
@@ -0,0 +1,115 @@
+import os
+import joblib
+from functools import cached_property
+
+from google.cloud import storage as gcs
+from dotenv import load_dotenv
+
+# load the local ".env" file first, so the os.getenv() calls below can see its values:
+load_dotenv()
+
+# not passed anywhere in this module; read here only to surface the setting
+GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") # implicit check by google.cloud for env var
+
+#PROJECT_ID = os.getenv("GOOGLE_PROJECT_NAME") # "my-project"
+# default bucket for StorageService / ModelStorage (None when the env var is unset)
+BUCKET_NAME = os.getenv("BUCKET_NAME") # "my-bucket" needs to be globally unique!
+
+
+class StorageService:
+    """Wrapper around a Google Cloud Storage client, scoped to a single bucket.
+
+    The client, the bucket listing, and the bucket itself are each resolved
+    lazily on first access and then cached on the instance.
+    """
+
+    def __init__(self, bucket_name=BUCKET_NAME): # project_id=PROJECT_ID
+        """
+        Params:
+            bucket_name (str): name of the bucket to use. Defaults to the
+                BUCKET_NAME environment variable, as read at import time.
+        """
+        #self.project_id = project_id
+        self.bucket_name = bucket_name
+        print("-----------------------")
+        print("CLOUD STORAGE SERVICE...")
+        #print("PROJECT ID:", self.project_id)
+        print("BUCKET NAME:", self.bucket_name)
+
+    @cached_property
+    def client(self):
+        """The storage client.
+
+        Cached so that repeated accesses (listing buckets, creating a bucket,
+        listing blobs) reuse one client instead of constructing a new one each
+        time, consistent with the cached `buckets` and `bucket` properties.
+        """
+        return gcs.Client() # project=self.project_id
+
+    @cached_property
+    def buckets(self):
+        """List of all buckets returned by the client (fetched once)."""
+        return list(self.client.list_buckets())
+
+    def find_or_create_bucket(self):
+        """Returns the bucket named `self.bucket_name`, creating it if no
+        existing bucket has that name."""
+        for bucket in self.buckets:
+            if bucket.name == self.bucket_name:
+                print("USING EXISTING BUCKET...")
+                return bucket
+
+        print("CREATING BUCKET...")
+        return self.client.create_bucket(self.bucket_name)
+
+    @cached_property
+    def bucket(self):
+        """The bucket this service operates on (found or created on first access)."""
+        return self.find_or_create_bucket()
+
+    def list_model_blobs(self):
+        """Returns the blobs in the bucket whose name ends in a "model.joblib" path segment."""
+        # https://cloud.google.com/storage/docs/json_api/v1/objects/list#list-object-glob
+        return self.client.list_blobs(self.bucket_name, match_glob="**/model.joblib")
+
+
+
+
+
+class ModelStorage(StorageService):
+    """Saves a trained model to a local results directory and mirrors it to
+    (or restores it from) the cloud storage bucket."""
+
+    def __init__(self, local_dirpath:str, bucket_name=BUCKET_NAME, storage_dirpath=None): # project_id=PROJECT_ID
+        """
+        Params:
+            local_dirpath (str): local directory where the model file gets
+                written, assumed to be somewhere in the results dir.
+            bucket_name (str): name of the bucket to use. Defaults to the
+                BUCKET_NAME environment variable, as read at import time.
+            storage_dirpath (str): directory path within the bucket. Defaults
+                to the part of `local_dirpath` that follows its last "..".
+        """
+        super().__init__(bucket_name=bucket_name) # project_id=project_id
+
+        self.local_dirpath = local_dirpath
+        print("RESULTS DIR:", self.local_dirpath)
+
+        # TODO: the default leaves an initial slash, which may create a
+        # redundant "/" directory on cloud storage. It is kept as-is because
+        # models have already been saved under that path.
+        self.storage_dirpath = storage_dirpath or self.local_dirpath.split("..")[-1] #> "/results/onwards/"
+        print("STORAGE DIR:", self.storage_dirpath)
+
+        self.model_filename = "model.joblib" # needs to be called 'model.joblib' specifically, for hosting from cloud storage on Google Vertex AI
+        self.local_model_filepath = os.path.join(self.local_dirpath, self.model_filename)
+        # the blob name must use "/" separators regardless of the local OS,
+        # otherwise it would not match the "**/model.joblib" glob used by
+        # list_model_blobs() (this is a no-op where os.sep is already "/")
+        self.hosted_model_filepath = os.path.join(self.storage_dirpath, self.model_filename).replace(os.sep, "/")
+
+    @property
+    def model_blob(self):
+        """Blob handle for the hosted model file within the bucket."""
+        return self.bucket.blob(self.hosted_model_filepath)
+
+    def save_model(self, model):
+        """Writes the model to the local model filepath, creating the local
+        directory if it does not exist."""
+        print("SAVING MODEL (LOCAL)...")
+        os.makedirs(self.local_dirpath, exist_ok=True)
+        joblib.dump(model, self.local_model_filepath)
+
+    def upload_model_from_file(self):
+        """Uploads the local model file to the hosted model filepath."""
+        print("UPLOADING MODEL...")
+        self.model_blob.upload_from_filename(self.local_model_filepath)
+
+    def save_and_upload_model(self, model):
+        """Saves the model locally, then uploads that file to cloud storage."""
+        self.save_model(model)
+        self.upload_model_from_file()
+
+    def download_model(self):
+        """Loads and returns the model directly from the hosted blob."""
+        print("DOWNLOADING MODEL...")
+        # SECURITY NOTE: joblib.load unpickles the file, which can execute
+        # arbitrary code, so only use this with a bucket whose contents you trust.
+        with self.model_blob.open(mode="rb") as file:
+            return joblib.load(file)
+
+
+
+
+if __name__ == "__main__":
+    # Manual check of the storage connection, using the default bucket name.
+
+
+    storage = StorageService()
+
+    # list every bucket returned by the client:
+    print("---------------------")
+    for bucket in storage.buckets:
+        print(bucket)
+
+    # first access of storage.bucket finds the named bucket, or creates it:
+    print("---------------------")
+    print(storage.bucket)
+
+    print("---------------------")
+
+    # list everything currently stored in the bucket:
+    blobs = list(storage.bucket.list_blobs())
+    for blob in blobs:
+        print("...", blob)
+
+    # pause in the debugger, with the variables above still in scope:
+    breakpoint()
@@ -1,5 +1,5 @@
 
-
+python-dotenv
 
 # data processing:
 pandas
@@ -24,5 +24,9 @@ umap-learn
 kmodes
 hdbscan
 
+# model storage:
+google-cloud-storage
+
+
 # automated tests:
 pytest