Skip to content

Commit a164ea8

Browse files
authored
OpenAI Embeddings (#20)
Re-implement code for fetching text embeddings from OpenAI.
1 parent 594495f commit a164ea8

File tree

7 files changed

+348
-1
lines changed

7 files changed

+348
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ google-credentials-shared.json
88

99
data/*/*.csv
1010
data/*/*.csv.gz
11+
#!data/*/example_openai_embeddings.csv
1112

1213
results/*/*.csv
1314
results/*/*.csv.gz

README.md

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ Install package dependencies:
2323
pip install -r requirements.txt
2424
```
2525

26+
### OpenAI API
27+
28+
Obtain an OpenAI API Key (i.e. `OPENAI_API_KEY`). We initially fetched embeddings from the OpenAI API via the [notebooks](/notebooks/README.md), but the service code has been re-implemented here afterwards, in case you want to experiment with obtaining your own embeddings.
29+
2630
### Users Sample
2731

2832
Obtain a copy of the "botometer_sample_openai_tweet_embeddings_20230724.csv.gz" CSV file, and store it in the "data/text-embedding-ada-002" directory in this repo. This file was generated by the [notebooks](/notebooks/README.md), and is ignored from version control because it contains user identifiers.
@@ -33,20 +37,31 @@ We are saving trained models to Google Cloud Storage. You will need to create a
3337

3438
From the cloud storage console, create a new bucket, and note its name (i.e. `BUCKET_NAME`).
3539

40+
3641
### Environment Variables
3742

3843
Create a local ".env" file and add contents like the following:
3944

4045
```sh
4146
# this is the ".env" file...
4247

48+
OPENAI_API_KEY="sk__________"
49+
4350
GOOGLE_APPLICATION_CREDENTIALS="/path/to/openai-embeddings-2023/google-credentials.json"
4451
BUCKET_NAME="my-bucket"
45-
4652
```
4753

4854
## Usage
4955

56+
### OpenAI Embeddings
57+
58+
Fetch some example embeddings from OpenAI API:
59+
60+
```sh
61+
python -m app.openai_service
62+
```
63+
64+
5065
### Dataset Loading
5166

5267
Demonstrate ability to load the dataset:

app/openai_service.py

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
# https://github.com/s2t2/openai-embeddings-2023/blob/main/notebooks/1_botometer_users_sample_and_openai_embeddings_20230704.py
2+
3+
import os
4+
from time import sleep
5+
from pprint import pprint
6+
import json
7+
8+
import openai
9+
from openai import Model, Embedding
10+
from pandas import DataFrame
11+
from dotenv import load_dotenv
12+
13+
14+
# Read configuration (API key, model choice) from a local ".env" file into the environment.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Embedding model to use; defaults to the model OpenAI's docs recommend for embeddings.
MODEL_ID = os.getenv("OPENAI_EMBEDDING_MODEL_ID", default="text-embedding-ada-002")

# Authenticate the (0.x-style) openai client globally for all subsequent API calls.
openai.api_key = OPENAI_API_KEY
20+
21+
22+
23+
def split_into_batches(my_list, batch_size=10_000):
    """Yield consecutive slices of `my_list`, each with at most `batch_size` items.

    The final slice holds whatever remains and may be shorter.
    h/t: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """
    start = 0
    while start < len(my_list):
        yield my_list[start : start + batch_size]
        start += batch_size
28+
29+
def dynamic_batches(texts, batch_char_limit=30_000):
    """Group `texts` into batches whose combined character count stays within
    `batch_char_limit`. Batches may have different lengths.

    A text longer than the limit is truncated and placed in a batch of its own.
    NOTE(review): truncation keeps `batch_char_limit - 1` characters (looks like
    an off-by-one, but the existing tests pin this behavior — confirm intent).
    NOTE(review): if the very first text exceeds the limit, an empty leading
    batch is emitted, and `running_chars` records the pre-truncation length;
    both quirks preserved as-is to match current behavior.
    """
    batches = []
    current = []
    running_chars = 0

    for text in texts:
        original_len = len(text)

        if running_chars + original_len <= batch_char_limit:
            # room remains in the current batch
            current.append(text)
            running_chars += original_len
        else:
            # current batch is full — flush it and start a new one with this text
            if original_len > batch_char_limit:
                # cap over-long text (to limit-1 chars, see NOTE above)
                text = text[0:batch_char_limit - 1]
            batches.append(current)
            current = [text]
            running_chars = original_len

    if current:
        batches.append(current)

    return batches
60+
61+
62+
63+
class OpenAIService():
    """OpenAI API Service

    + https://github.com/openai/openai-python
    + https://platform.openai.com/account/api-keys
    + https://platform.openai.com/docs/introduction/key-concepts
    + https://platform.openai.com/docs/models/overview
    + https://platform.openai.com/docs/guides/embeddings/what-are-embeddings
    + https://platform.openai.com/docs/guides/embeddings/embedding-models

    > We recommend using `text-embedding-ada-002` for nearly all
    (Embedding) use cases. It's better, cheaper, and simpler to use.
    """

    def __init__(self, model_id=MODEL_ID):
        """Params:
            model_id : embedding model name (default from OPENAI_EMBEDDING_MODEL_ID env var)
        """
        self.model_id = model_id
        print("EMBEDDING MODEL:", self.model_id)

    def get_models(self):
        """Fetch the available models from the API.

        Returns a DataFrame with one row per model, sorted by model id.
        """
        models = Model.list()  # API CALL

        records = []
        for model in sorted(models.data, key=lambda m: m.id):
            model_info = model.to_dict()
            del model_info["permission"]  # nested list, not DataFrame-friendly
            records.append(model_info)

        return DataFrame(records)

    def get_embeddings(self, texts):
        """Pass in a list of strings. Returns a list of embeddings for each."""
        result = Embedding.create(input=texts, model=self.model_id)  # API CALL
        return [d["embedding"] for d in result["data"]]

    def _get_embeddings_with_retry(self, texts_batch, sleep_seconds):
        """Request embeddings for a single batch, sleeping and retrying the
        same batch whenever the API raises a RateLimitError.
        """
        while True:
            try:
                return self.get_embeddings(texts_batch)  # API CALL
            except openai.error.RateLimitError:
                print(f"... Rate limit reached. Sleeping for {sleep_seconds} seconds.")
                sleep(sleep_seconds)
                # retry the same batch

    def get_embeddings_in_batches(self, texts, batch_size=250, sleep_seconds=60):
        """High level wrapper to work around RateLimitError:
            Rate limit reached for [MODEL] in [ORG] on tokens per min.
            Limit: 1_000_000 tokens / min.

        Params:
            batch_size : Number of users to request per API call
            sleep_seconds : Wait for a minute before retrying a rate-limited batch

        Also beware InvalidRequestError:
            This model's maximum context length is 8191 tokens,
            however you requested X tokens (X in your prompt; 0 for the completion).
            Please reduce your prompt; or completion length.

        ... so we should make lots of smaller requests.
        """
        embeddings = []
        for counter, texts_batch in enumerate(split_into_batches(texts, batch_size=batch_size), start=1):
            print(counter, len(texts_batch))
            embeddings += self._get_embeddings_with_retry(texts_batch, sleep_seconds)
        return embeddings

    def get_embeddings_in_dynamic_batches(self, texts, batch_char_limit=30_000, sleep_seconds=60):
        """High level wrapper to work around API limitations

        RateLimitError:
            Rate limit reached for [MODEL] in [ORG] on tokens per min.
            Limit: 1_000_000 tokens / min.

        AND

        InvalidRequestError:
            This model's maximum context length is 8191 tokens,
            however you requested X tokens (X in your prompt; 0 for the completion).
            Please reduce your prompt; or completion length.

        Params:
            batch_char_limit : Number of max characters to request per API call.
                Should be less than around 32_000 based on API docs.
            sleep_seconds : Wait for a minute before retrying a rate-limited batch
        """
        embeddings = []
        for counter, texts_batch in enumerate(dynamic_batches(texts, batch_char_limit=batch_char_limit), start=1):
            print(counter, len(texts_batch))
            embeddings += self._get_embeddings_with_retry(texts_batch, sleep_seconds)
        return embeddings
202+
203+
204+
205+
206+
207+
208+
209+
210+
if __name__ == "__main__":

    from app import DATA_DIRPATH

    # Demo: fetch embeddings for a few example texts and save them to CSV.

    print("-----------------")
    print("TEXTS:")
    texts = [
        "Short and sweet",
        "Short short",
        "I like apples, but bananas are gross.",
        "This is a tweet about bananas",
        "Drink apple juice!",
    ]
    pprint(texts)

    print("-----------------")
    print("EMBEDDINGS:")

    ai = OpenAIService()

    embeddings = ai.get_embeddings(texts)  # API CALL (one 1536-dim vector per text)

    df = DataFrame({"text": texts, "openai_embeddings": embeddings})
    print(df)

    print("-----------------")
    # unpack the embeddings into their own columns, named "0" .. "1535"
    embeds_df = DataFrame(df["openai_embeddings"].values.tolist())
    embeds_df.columns = [str(i) for i in range(0, len(embeds_df.columns))]
    embeds_df = df.drop(columns=["openai_embeddings"]).merge(embeds_df, left_index=True, right_index=True)
    print(embeds_df)

    print("-----------------")
    print("SAVING...")

    # write to a per-model subdirectory of the data dir
    model_dirpath = os.path.join(DATA_DIRPATH, ai.model_id)
    os.makedirs(model_dirpath, exist_ok=True)

    embeddings_csv_filepath = os.path.join(model_dirpath, "example_openai_embeddings.csv")
    embeds_df.to_csv(embeddings_csv_filepath)

data/text-embedding-ada-002/example_openai_embeddings.csv

Lines changed: 6 additions & 0 deletions
Large diffs are not rendered by default.

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ matplotlib
1111
plotly
1212
kaleido # for exporting plotly images to png / html
1313

14+
# embeddings:
15+
openai==0.28 # pinned: a 1.x interface now exists, but these embeddings were originally obtained via the 0.x API
16+
1417

1518
# machine learning:
1619
scikit-learn

test/data/text-embedding-ada-002/example_openai_embeddings.csv

Lines changed: 6 additions & 0 deletions
Large diffs are not rendered by default.

test/openai_service_test.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
2+
import os
3+
4+
from pytest import fixture
5+
from pandas import read_csv
6+
7+
from app import DATA_DIRPATH
8+
from app.openai_service import split_into_batches, dynamic_batches
9+
10+
11+
def test_batchmakers():
    """Both batching helpers should partition the sample texts as expected."""
    texts = [
        "Short and sweet",
        "Short short",
        "I like apples, but bananas are gross.",
        "This is a tweet about bananas",
        "Drink apple juice!",
    ]

    # fixed-size batching: groups of two, plus a trailing remainder
    assert list(split_into_batches(texts, batch_size=2)) == [
        ["Short and sweet", "Short short"],
        ["I like apples, but bananas are gross.", "This is a tweet about bananas"],
        ["Drink apple juice!"],
    ]

    # character-limited batching: an over-long text is truncated and batched alone
    assert dynamic_batches(texts, batch_char_limit=30) == [
        ["Short and sweet", "Short short"],
        ["I like apples, but bananas ar"],
        ["This is a tweet about bananas"],
        ["Drink apple juice!"],
    ]
35+
36+
37+
38+
39+
#@fixture(scope="module")
40+
#def example_embeddings_df():
41+
42+
43+
def test_load_embeddings():
    """The example embeddings CSV (produced by `python -m app.openai_service`)
    should load into a frame of five texts with 1536 embedding columns each.
    """
    example_embeddings_csv_filepath = os.path.join(os.path.dirname(__file__), "data", "text-embedding-ada-002", "example_openai_embeddings.csv")
    # was a debug print(os.path.isfile(...)); assert instead so a missing fixture fails loudly
    assert os.path.isfile(example_embeddings_csv_filepath)
    embeds_df = read_csv(example_embeddings_csv_filepath)
    embeds_df.drop(columns=["Unnamed: 0"], inplace=True)  # drop the saved index column

    assert "text" in embeds_df.columns
    assert embeds_df.drop(columns=["text"]).shape == (5, 1536)

0 commit comments

Comments
 (0)