Skip to content

Commit 594495f

Browse files
authored
Model Storage (#19)
Saves trained models to Cloud Storage (so we can load them easily from Colab notebooks and potentially also host via Vertex AI endpoint later). Changes hyperparameter search space and updates all classification results files. * Model storage * De-couple service class responsibilities * Prepare for results * Logistic Results * XGBoost search * XGBoost results * Random Forest results * Add setup instructions to README * Test storage service * Project inferred from credentials file * List hosted models
1 parent 3c6505d commit 594495f

File tree

129 files changed

+25164
-24434
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

129 files changed

+25164
-24434
lines changed

.gitignore

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,13 @@ data/*/*.csv.gz
1212
results/*/*.csv
1313
results/*/*.csv.gz
1414
results/*/*.json
15+
results/*/*.json
1516

1617
#results/*/*/*.png
1718
#results/*/*/*.html
1819
results/*/*/*.json
1920

20-
#results/*/*/*/*.png
21-
#results/*/*/*/*.html
22-
#results/*/*/*/*.json
21+
results/classification/*/*/model.joblib
2322

2423
# ignore these files b/c they contain user ids:
2524
results/classification/*/*/predictions.csv

README.md

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
# openai-embeddings-2023
22

3-
Dimensionality Reduction on Twitter Data using OpenAI Embeddings
3+
OpenAI Text Embeddings for User Classification in Social Networks
44

5-
## Research Questions
6-
7-
Can we use ChatGPT's embeddings to reproduce our previous research?
8-
9-
Can ChatGPT discern bot status, political sentiment, and q-anon support, based on user profiles and tweets?
5+
+ [Results Website](https://s2t2.github.io/openai-embeddings-2023/index.html)
6+
+ [Conference Talk (INFORMS 2023)](https://www.youtube.com/watch?v=AmF-5D4p1_4)
107

118

129
## Setup
1310

11+
### Virtual Environment
12+
1413
Create and/or activate virtual environment:
1514

1615
```sh
@@ -24,8 +23,28 @@ Install package dependencies:
2423
pip install -r requirements.txt
2524
```
2625

26+
### Users Sample
27+
2728
Obtain a copy of the "botometer_sample_openai_tweet_embeddings_20230724.csv.gz" CSV file, and store it in the "data/text-embedding-ada-002" directory in this repo. This file was generated by the [notebooks](/notebooks/README.md), and is ignored from version control because it contains user identifiers.
2829

30+
### Cloud Storage
31+
32+
We are saving trained models to Google Cloud Storage. You will need to create a project on Google Cloud, and enable the Cloud Storage API as necessary. Then create a service account, download its JSON credentials file, and store it in the root directory of this repo as "google-credentials.json". This file is excluded from version control.
33+
34+
From the cloud storage console, create a new bucket, and note its name (i.e. `BUCKET_NAME`).
35+
36+
### Environment Variables
37+
38+
Create a local ".env" file and add contents like the following:
39+
40+
```sh
41+
# this is the ".env" file...
42+
43+
GOOGLE_APPLICATION_CREDENTIALS="/path/to/openai-embeddings-2023/google-credentials.json"
44+
BUCKET_NAME="my-bucket"
45+
46+
```
47+
2948
## Usage
3049

3150
### Dataset Loading

app/classification/logistic_regression.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,22 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
2222

2323
# C (float), default=1.0
2424
# Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
25-
#"classifier__C": [0.5, 1, 2, 10, 100],
25+
"classifier__C": [
26+
#0.1,
27+
#0.5,
28+
1, 2, 5,
29+
10, 25, 50,
30+
100
31+
],
2632

2733
# default max_iter is 100
28-
"classifier__max_iter": [#5, #15,
29-
#20,
30-
25, #30, #35,
31-
#50,
32-
#100, #250,
33-
#500,
34-
1_000,
35-
10_000
36-
],
34+
"classifier__max_iter": [10, 25,
35+
50,
36+
100,
37+
250,
38+
500, 1_000, 5_000, 10_000
39+
],
40+
3741
#"classifier__solver": ["liblinear", "newton-cg", "lbfgs", "sag", "saga"],
3842
}
3943

app/classification/pipeline.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,28 @@
66

77
import numpy as np
88
from pandas import Series, DataFrame
9-
from sklearn.model_selection import GridSearchCV, train_test_split
10-
from sklearn.pipeline import Pipeline
11-
#import matplotlib.pyplot as plt
129
import plotly.express as px
1310
import plotly.graph_objs as go
1411

12+
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
13+
from sklearn.model_selection import GridSearchCV, train_test_split
14+
from sklearn.pipeline import Pipeline
15+
from sklearn.metrics import roc_curve, auc
16+
17+
from app.colors import ORANGES
1518
from app.dataset import Dataset
19+
from app.model_storage import ModelStorage
1620
from app.classification import CLASSIFICATION_RESULTS_DIRPATH, save_results_json, class_labels
1721
from app.classification.results import ClassificationResults
1822

23+
1924
K_FOLDS = int(os.getenv("K_FOLDS", default="5"))
2025
#X_SCALE = bool(os.getenv("X_SCALE", default="false").lower() == "true")
2126
#SCALER_TYPE = os.getenv("SCALER_TYPE")
2227

2328
FIG_SHOW = bool(os.getenv("FIG_SHOW", default="false").lower() == "true")
2429
FIG_SAVE = bool(os.getenv("FIG_SAVE", default="true").lower() == "true")
2530

26-
from sklearn.metrics import roc_curve, roc_auc_score, auc
27-
import matplotlib.pyplot as plt
28-
from sklearn.preprocessing import label_binarize, LabelBinarizer, LabelEncoder
29-
from app.colors import ORANGES
3031

3132

3233
class ClassificationPipeline(ABC):
@@ -72,6 +73,7 @@ def __init__(self, ds=None, x_scale=False, y_col="is_bot", param_grid=None, k_fo
7273
self.gs = None
7374
self.results = None
7475
self.results_json = {}
76+
self.storage = None
7577

7678
# set in child class:
7779
self.model = None
@@ -94,7 +96,9 @@ def perform(self):
9496
else:
9597
self.plot_roc_curve()
9698

97-
#self.save_and_upload_model()
99+
# upload to cloud storage :-D
100+
self.storage = ModelStorage(local_dirpath=self.results_dirpath)
101+
self.storage.save_and_upload_model(self.model)
98102

99103

100104
def train_eval(self):

app/classification/random_forest.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
1616
self.param_grid = param_grid or {
1717

1818
# default=100
19-
"classifier__n_estimators": [50, 100, 150, 250],
19+
"classifier__n_estimators": [50,
20+
100, 150, 250, 500
21+
],
2022

2123
# criterion {"gini", "entropy", "log_loss"}, default="gini"
2224
# ... The function to measure the quality of a split.
2325
# ... "gini" for Gini impurity, "log_loss" / "entropy" for Shannon information gain
24-
"classifier__criterion": ["gini", "log_loss"],
26+
#"classifier__criterion": ["gini", "log_loss"],
2527

2628
# min_samples_split (int or float), default=2
2729
#The minimum number (or percentage) of samples required to split an internal node
@@ -31,12 +33,12 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
3133
#... The minimum number of samples required to be at a leaf node.
3234
# ... A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches.
3335
# ... This may have the effect of smoothing the model, especially in regression.
34-
"classifier__min_samples_leaf": (1, 5,
35-
10, 25,
36-
50, #75,
37-
#90,
38-
100, #110, #125, 150
39-
),
36+
#"classifier__min_samples_leaf": (1, 5,
37+
# 10, 25,
38+
# 50, #75,
39+
# #90,
40+
# 100, #110, #125, 150
41+
# ),
4042

4143
# max_depth (int), default=None
4244
# ... The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
@@ -78,9 +80,7 @@ def coefs(self):
7880

7981
for y_col in Y_COLS:
8082

81-
pipeline = RandomForestPipeline(ds=ds, y_col=y_col, param_grid={
82-
"classifier__criterion": ["gini"],
83-
"classifier__min_samples_leaf": [3, 5, 8],
84-
"classifier__n_estimators": [250, 500, 1000],
85-
})
83+
pipeline = RandomForestPipeline(ds=ds, y_col=y_col)
8684
pipeline.perform()
85+
86+
#breakpoint()

app/classification/xgboost.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,10 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
3535
self.param_grid = param_grid or {
3636

3737
# n_estimators (Optional[int]) – Number of boosting rounds.
38-
38+
'classifier__n_estimators': [ #50,
39+
100, 150, 250, 500],
3940

4041
# max_depth (Optional[int]) – Maximum tree depth for base learners.
41-
#"max_depth": [2, 4, 8, 16],
4242

4343
# max_leaves – Maximum number of leaves; 0 indicates no limit.
4444
#"max_leaves": [0, 2, 4, 8, 16]
@@ -48,6 +48,8 @@ def __init__(self, ds=None, y_col="is_bot", param_grid=None, results_dirpath=Non
4848
# grow_policy – Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow depth-wise. 1: favor splitting at nodes with highest loss change.
4949
#
5050
# learning_rate (Optional[float]) – Boosting learning rate (xgb’s “eta”)
51+
#'classifier__learning_rate': [0.1, 0.2, 0.3, 0.5],
52+
5153
#
5254
# verbosity (Optional[int]) – The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
5355
#
@@ -133,7 +135,7 @@ def explainability_json(self):
133135

134136
@property
135137
def coefs(self):
136-
"""random forest has .feature_importances_ instead of .coef_ """
138+
"""xgboost has .feature_importances_ instead of .coef_ """
137139
return Series(self.model.feature_importances_, index=self.model.feature_names_in_) #.sort_values(ascending=False) # don't sort? preserve order with features?
138140

139141

@@ -153,3 +155,5 @@ def coefs(self):
153155

154156
pipeline = XGBoostPipeline(ds=ds, y_col=y_col)
155157
pipeline.perform()
158+
159+
#breakpoint()

app/model_storage.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import os
2+
import joblib
3+
from functools import cached_property
4+
5+
from google.cloud import storage as gcs
6+
from dotenv import load_dotenv
7+
8+
load_dotenv()
9+
10+
GOOGLE_APPLICATION_CREDENTIALS = os.getenv("GOOGLE_APPLICATION_CREDENTIALS") # implicit check by google.cloud for env var
11+
12+
#PROJECT_ID = os.getenv("GOOGLE_PROJECT_NAME") # "my-project"
13+
BUCKET_NAME = os.getenv("BUCKET_NAME") # "my-bucket" needs to be globally unique!
14+
15+
16+
class StorageService:
    """Access a single Google Cloud Storage bucket through one shared client.

    No project id is passed to the client: ``gcs.Client()`` is constructed
    without arguments, so the project and credentials come from the
    environment (see the GOOGLE_APPLICATION_CREDENTIALS note at module top).
    """

    def __init__(self, bucket_name=BUCKET_NAME): # project_id=PROJECT_ID
        """Params bucket_name (str) : name of the bucket to use, defaults to the BUCKET_NAME env var"""
        #self.project_id = project_id
        self.bucket_name = bucket_name
        print("-----------------------")
        print("CLOUD STORAGE SERVICE...")
        #print("PROJECT ID:", self.project_id)
        print("BUCKET NAME:", self.bucket_name)

    @cached_property
    def client(self):
        """The storage client, created on first access and then reused.

        Cached so that listing buckets, creating the bucket and listing
        blobs all go through one client instead of building a new one
        on every attribute access.
        """
        return gcs.Client() # project=self.project_id

    @cached_property
    def buckets(self):
        """All buckets returned by the client, fetched once and kept as a list."""
        return list(self.client.list_buckets())

    def find_or_create_bucket(self):
        """Return the bucket named ``self.bucket_name``, creating it when absent.

        Looks through the cached ``buckets`` list first, and only asks the
        client to create a bucket when no existing one has a matching name.
        """
        existing = next((b for b in self.buckets if b.name == self.bucket_name), None)
        if existing is not None:
            print("USING EXISTING BUCKET...")
            return existing

        print("CREATING BUCKET...")
        return self.client.create_bucket(self.bucket_name)

    @cached_property
    def bucket(self):
        """The bucket this service operates on (found or created on first access)."""
        return self.find_or_create_bucket()

    def list_model_blobs(self):
        """Return the client's listing of blobs whose name matches "**/model.joblib"."""
        # https://cloud.google.com/storage/docs/json_api/v1/objects/list#list-object-glob
        return self.client.list_blobs(self.bucket_name, match_glob="**/model.joblib")
50+
51+
52+
53+
54+
55+
class ModelStorage(StorageService):
    """Save a trained model to a local results directory and mirror it to the bucket."""

    def __init__(self, local_dirpath:str, bucket_name=BUCKET_NAME, storage_dirpath=None): # project_id=PROJECT_ID
        """
        Params
            local_dirpath (str) : directory the model file is written to, assumed to be somewhere in the results dir
            bucket_name (str) : name of the bucket to upload to
            storage_dirpath (str) : directory within the bucket; when not provided, it is derived from the local_dirpath
        """
        super().__init__(bucket_name=bucket_name) # project_id=project_id

        self.local_dirpath = local_dirpath
        print("RESULTS DIR:", self.local_dirpath)

        if storage_dirpath:
            self.storage_dirpath = storage_dirpath
        else:
            # keep everything after the last ".." in the local path #> "/results/onwards/"
            # TODO: this leaves an initial slash, which may create a redundant "/" directory on cloud storage, so consider removing initial slash if possible in the future (oops already saved all the models there :-D)
            self.storage_dirpath = self.local_dirpath.split("..")[-1]
        print("STORAGE DIR:", self.storage_dirpath)

        # needs to be called 'model.joblib' specifically, for hosting from cloud storage on Google Vertex AI
        self.model_filename = "model.joblib"
        self.local_model_filepath = os.path.join(self.local_dirpath, self.model_filename)
        self.hosted_model_filepath = os.path.join(self.storage_dirpath, self.model_filename)

    @property
    def model_blob(self):
        """A blob handle in the bucket at the hosted model filepath."""
        bucket = self.bucket
        return bucket.blob(self.hosted_model_filepath)

    def save_model(self, model):
        """Write the model to the local model filepath, creating the directory as needed."""
        print("SAVING MODEL (LOCAL)...")
        os.makedirs(self.local_dirpath, exist_ok=True)
        joblib.dump(model, self.local_model_filepath)

    def upload_model_from_file(self):
        """Upload the locally saved model file to the model blob."""
        print("UPLOADING MODEL...")
        blob = self.model_blob
        blob.upload_from_filename(self.local_model_filepath)

    def save_and_upload_model(self, model):
        """Save the model locally, then upload that file to cloud storage."""
        self.save_model(model)
        self.upload_model_from_file()

    def download_model(self):
        """Read the model blob and return the object loaded from it by joblib."""
        print("DOWNLOADING MODEL...")
        # NOTE(review): joblib.load unpickles the blob contents, so only use this with buckets you trust
        with self.model_blob.open(mode="rb") as remote_file:
            model = joblib.load(remote_file)
        return model
93+
94+
95+
96+
97+
if __name__ == "__main__":
    # manual check: show the available buckets, the configured bucket, and its contents

    storage = StorageService()

    print("---------------------")
    for existing_bucket in storage.buckets:
        print(existing_bucket)

    print("---------------------")
    print(storage.bucket)

    print("---------------------")

    blobs = list(storage.bucket.list_blobs())
    for hosted_blob in blobs:
        print("...", hosted_blob)

    # drop into the debugger so the storage service and blobs can be explored interactively
    breakpoint()

requirements.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
2+
python-dotenv
33

44
# data processing:
55
pandas
@@ -24,5 +24,9 @@ umap-learn
2424
kmodes
2525
hdbscan
2626

27+
# model storage:
28+
google-cloud-storage
29+
30+
2731
# automated tests:
2832
pytest

results/classification/fourway_label/logistic_regression/confusion.html

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.
1 Byte
Loading

0 commit comments

Comments
 (0)