embeddings.py
import os
from ast import literal_eval
from typing import Iterator

import chromadb
import numpy as np
import pandas as pd
import tiktoken
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Load the API key from .env before constructing clients that read it.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

client = OpenAI()
chroma_client = chromadb.PersistentClient(path="data/chroma")

EMBEDDINGS_MODEL = "text-embedding-3-small"
OPENAI_EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-3-small
MAX_EMBEDDING_TOKENS = 8191  # the maximum input length for text-embedding-3-small
EMBEDDINGS_INDEX_NAME = "book-notes"
BATCH_SIZE = 100

# A simple batch generator that makes chunks out of an input DataFrame.
class BatchGenerator:
    def __init__(self, batch_size: int = 10) -> None:
        self.batch_size = batch_size

    # Makes chunks out of an input DataFrame.
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        splits = self.splits_num(df.shape[0])
        if splits <= 1:
            yield df
        else:
            for chunk in np.array_split(df, splits):
                yield chunk

    # Determines how many chunks the DataFrame should be split into.
    def splits_num(self, elements: int) -> int:
        return round(elements / self.batch_size)

    __call__ = to_batches

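# Example (hypothetical data): with BATCH_SIZE = 100, a 250-row DataFrame splits
# into round(250 / 100) = 2 chunks of 125 rows each.
#   batcher = BatchGenerator(BATCH_SIZE)
#   for batch in batcher(pd.DataFrame({'id': range(250)})):
#       print(len(batch))  # -> 125, 125
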
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model=EMBEDDINGS_MODEL) -> list[float]:
    text = text.replace("\n", " ")  # OpenAI says removing newlines leads to better performance
    response = client.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding

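# Example: text-embedding-3-small returns 1536-dimensional vectors by default.
#   vec = get_embedding("The whole is greater than the sum of its parts.")
#   len(vec)  # -> 1536
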
def get_embeddings(df: pd.DataFrame):
    print('Getting embeddings...')
    encoding = tiktoken.get_encoding(OPENAI_EMBEDDING_ENCODING)
    # Omit any rows that are too long to embed.
    df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
    df = df[df.n_tokens <= MAX_EMBEDDING_TOKENS].copy()  # copy to avoid SettingWithCopyWarning
    df["embedding"] = df.combined.apply(lambda x: get_embedding(x, model=EMBEDDINGS_MODEL))
    print('Done getting embeddings.')
    return df

def compute_embeddings(df: pd.DataFrame):
    # Alternative to get_embeddings(): returns a {row_index: embedding} dict
    # without filtering out over-length rows first.
    return {
        idx: get_embedding(r.combined) for idx, r in df.iterrows()
    }

def load_embeddings(filepath: str = 'data/embeddings/book_notes_w_embeddings.csv'):
    """Load the dataset with the embeddings from a CSV file."""
    df = pd.read_csv(filepath)
    # CSV stores each embedding as a string; parse it back into a list of floats.
    df['embedding'] = df.embedding.apply(literal_eval)
    # Convert id to string.
    df['id'] = df['id'].apply(str)
    return df

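# Example round trip (assumes save_embeddings() has already written the file):
#   df = load_embeddings()
#   df['embedding'].iloc[0]  # -> list[float], parsed from its CSV string form
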
def load_dataset_for_embeddings(df: pd.DataFrame):
    """Configure the dataset for embeddings."""
    try:
        # Keep only the columns we need.
        df = df[['id', 'highlight', 'book', 'author', 'note', 'location', 'location_type']].copy()
        # Append the note per row, and only for rows that actually have one.
        notes = df['note'].fillna('').str.strip()
        df['combined'] = (
            "Title: " + df['book'].str.strip().fillna('') + "; " +
            "Author: " + df['author'].str.strip().fillna('') + "; " +
            "Highlight: " + df['highlight'].str.strip().fillna('') +
            ("; Note: " + notes).where(notes != '', '')
        )
        # Convert id to string.
        df['id'] = df['id'].apply(str)
        return df
    except Exception as e:
        print(f"Error configuring dataset for embeddings: {e}")
        return df

def save_embeddings(df: pd.DataFrame, output_path: str = 'data/embeddings/book_notes_w_embeddings.csv'):
    """Save the dataset with the embeddings to a CSV file."""
    if os.path.exists(output_path):
        # Append the new data to the existing data.
        existing_df = pd.read_csv(output_path)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_csv(output_path, index=False)
    print(f"Saved embeddings to {output_path}.")

# Using chromadb for embeddings search.
def add_embeddings_to_chroma(df: pd.DataFrame):
    print(f'Adding {len(df)} embeddings to chromadb...')
    ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=OPENAI_API_KEY,
        model_name=EMBEDDINGS_MODEL
    )
    collection = chroma_client.get_or_create_collection(
        name=EMBEDDINGS_INDEX_NAME,
        embedding_function=ef
    )
    # Add the precomputed embeddings in batches.
    df_batcher = BatchGenerator(BATCH_SIZE)
    for batch_df in df_batcher(df):
        collection.add(
            embeddings=batch_df['embedding'].tolist(),
            documents=batch_df['combined'].tolist(),
            ids=batch_df['id'].tolist()
        )
    print('Done adding to chromadb.')

def query_embeddings_chroma(query: str, n_results: int = 5):
    query_embedding = get_embedding(query)
    ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=OPENAI_API_KEY,
        model_name=EMBEDDINGS_MODEL
    )
    collection = chroma_client.get_collection(
        name=EMBEDDINGS_INDEX_NAME,
        embedding_function=ef
    )
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )
    ids = results["ids"][0]
    distances = results["distances"][0]
    # chromadb returns distances, so smaller means more relevant; sort
    # ascending to put the most relevant documents first.
    relevant_docs = [(distances[idx], id) for idx, id in enumerate(ids)]
    relevant_docs = sorted(relevant_docs)
    return relevant_docs
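
# A minimal end-to-end sketch of how the functions above fit together. The
# input path 'data/book_notes.csv' and the example query are assumptions for
# illustration, not part of the original module.
if __name__ == "__main__":
    raw_df = pd.read_csv('data/book_notes.csv')  # hypothetical source file
    prepared_df = load_dataset_for_embeddings(raw_df)
    embedded_df = get_embeddings(prepared_df)
    save_embeddings(embedded_df)
    add_embeddings_to_chroma(embedded_df)
    # Prints (distance, id) pairs, most relevant first.
    print(query_embeddings_chroma("notes about habit formation"))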