Upgrade to text-embedding-3-large model as default, with vector storage optimizations #2470

Open · wants to merge 20 commits into main
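For context before the file-by-file changes: the sketch below (not code from this PR) shows how the new `AZURE_SEARCH_FIELD_NAME_EMBEDDING` setting is intended to flow from the environment into vector query construction. The variable name, its backwards-compatible default, and the `VectorizedQuery` arguments mirror the diff; the standalone helper function is an illustrative assumption.

```python
# Minimal sketch, assuming the azure-search-documents SDK; mirrors how this PR
# threads a configurable embedding field name into the vector query.
import os

from azure.search.documents.models import VectorizedQuery

# Defaults to the previous field name "embedding" for backwards compatibility.
AZURE_SEARCH_FIELD_NAME_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding")


def build_vector_query(query_vector: list[float]) -> VectorizedQuery:
    # The index is provisioned with oversampling built in, so no explicit
    # oversampling parameter is needed at query time.
    return VectorizedQuery(
        vector=query_vector,
        k_nearest_neighbors=50,
        fields=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
    )
```

Because the default is unchanged, existing deployments whose index still uses the `embedding` field should keep working; the variable only needs to be set when the index is provisioned with a different embedding field name.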
1 change: 1 addition & 0 deletions .azdo/pipelines/azure-dev.yml
@@ -60,6 +60,7 @@ steps:
AZURE_SEARCH_QUERY_SPELLER: $(AZURE_SEARCH_QUERY_SPELLER)
AZURE_SEARCH_SEMANTIC_RANKER: $(AZURE_SEARCH_SEMANTIC_RANKER)
AZURE_SEARCH_QUERY_REWRITING: $(AZURE_SEARCH_QUERY_REWRITING)
AZURE_SEARCH_FIELD_NAME_EMBEDDING: $(AZURE_SEARCH_FIELD_NAME_EMBEDDING)
AZURE_STORAGE_ACCOUNT: $(AZURE_STORAGE_ACCOUNT)
AZURE_STORAGE_RESOURCE_GROUP: $(AZURE_STORAGE_RESOURCE_GROUP)
AZURE_STORAGE_SKU: $(AZURE_STORAGE_SKU)
1 change: 1 addition & 0 deletions .github/workflows/azure-dev.yml
@@ -50,6 +50,7 @@ jobs:
AZURE_SEARCH_QUERY_SPELLER: ${{ vars.AZURE_SEARCH_QUERY_SPELLER }}
AZURE_SEARCH_SEMANTIC_RANKER: ${{ vars.AZURE_SEARCH_SEMANTIC_RANKER }}
AZURE_SEARCH_QUERY_REWRITING: ${{ vars.AZURE_SEARCH_QUERY_REWRITING }}
AZURE_SEARCH_FIELD_NAME_EMBEDDING: ${{ vars.AZURE_SEARCH_FIELD_NAME_EMBEDDING }}
AZURE_STORAGE_ACCOUNT: ${{ vars.AZURE_STORAGE_ACCOUNT }}
AZURE_STORAGE_RESOURCE_GROUP: ${{ vars.AZURE_STORAGE_RESOURCE_GROUP }}
AZURE_STORAGE_SKU: ${{ vars.AZURE_STORAGE_SKU }}
11 changes: 10 additions & 1 deletion app/backend/app.py
@@ -464,6 +464,8 @@ async def setup_clients():
AZURE_SEARCH_QUERY_SPELLER = os.getenv("AZURE_SEARCH_QUERY_SPELLER") or "lexicon"
AZURE_SEARCH_SEMANTIC_RANKER = os.getenv("AZURE_SEARCH_SEMANTIC_RANKER", "free").lower()
AZURE_SEARCH_QUERY_REWRITING = os.getenv("AZURE_SEARCH_QUERY_REWRITING", "false").lower()
# This defaults to the previous field name "embedding", for backwards compatibility
AZURE_SEARCH_FIELD_NAME_EMBEDDING = os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding")

AZURE_SPEECH_SERVICE_ID = os.getenv("AZURE_SPEECH_SERVICE_ID")
AZURE_SPEECH_SERVICE_LOCATION = os.getenv("AZURE_SPEECH_SERVICE_LOCATION")
@@ -580,7 +582,10 @@ async def setup_clients():
disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false",
)
ingester = UploadUserFileStrategy(
search_info=search_info, embeddings=text_embeddings_service, file_processors=file_processors
search_info=search_info,
embeddings=text_embeddings_service,
file_processors=file_processors,
search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
)
current_app.config[CONFIG_INGESTER] = ingester

@@ -677,6 +682,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
@@ -695,6 +701,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
@@ -734,6 +741,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
@@ -755,6 +763,7 @@ async def setup_clients():
embedding_model=OPENAI_EMB_MODEL,
embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT,
embedding_dimensions=OPENAI_EMB_DIMENSIONS,
embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING,
sourcepage_field=KB_FIELDS_SOURCEPAGE,
content_field=KB_FIELDS_CONTENT,
query_language=AZURE_SEARCH_QUERY_LANGUAGE,
29 changes: 8 additions & 21 deletions app/backend/approaches/approach.py
@@ -38,8 +38,6 @@
class Document:
id: Optional[str]
content: Optional[str]
embedding: Optional[list[float]]
image_embedding: Optional[list[float]]
category: Optional[str]
sourcepage: Optional[str]
sourcefile: Optional[str]
@@ -50,11 +48,9 @@ class Document:
reranker_score: Optional[float] = None

def serialize_for_results(self) -> dict[str, Any]:
return {
result_dict = {
"id": self.id,
"content": self.content,
"embedding": Document.trim_embedding(self.embedding),
"imageEmbedding": Document.trim_embedding(self.image_embedding),
"category": self.category,
"sourcepage": self.sourcepage,
"sourcefile": self.sourcefile,
@@ -75,18 +71,7 @@ def serialize_for_results(self) -> dict[str, Any]:
"score": self.score,
"reranker_score": self.reranker_score,
}

@classmethod
def trim_embedding(cls, embedding: Optional[list[float]]) -> Optional[str]:
"""Returns a trimmed list of floats from the vector embedding."""
if embedding:
if len(embedding) > 2:
# Format the embedding list to show the first 2 items followed by the count of the remaining items."""
return f"[{embedding[0]}, {embedding[1]} ...+{len(embedding) - 2} more]"
else:
return str(embedding)

return None
return result_dict


@dataclass
@@ -159,6 +144,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
openai_host: str,
vision_endpoint: str,
vision_token_provider: Callable[[], Awaitable[str]],
@@ -173,6 +159,7 @@ def __init__(
self.embedding_deployment = embedding_deployment
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.openai_host = openai_host
self.vision_endpoint = vision_endpoint
self.vision_token_provider = vision_token_provider
@@ -238,8 +225,6 @@ async def search(
Document(
id=document.get("id"),
content=document.get("content"),
embedding=document.get("embedding"),
image_embedding=document.get("imageEmbedding"),
category=document.get("category"),
sourcepage=document.get("sourcepage"),
sourcefile=document.get("sourcefile"),
@@ -314,12 +299,14 @@ class ExtraArgs(TypedDict, total=False):
**dimensions_args,
)
query_vector = embedding.data[0].embedding
return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields="embedding")
# This performs oversampling because of how the search index was set up,
# so we do not need to explicitly pass an oversampling parameter here
return VectorizedQuery(vector=query_vector, k_nearest_neighbors=50, fields=self.embedding_field)

async def compute_image_embedding(self, q: str):
endpoint = urljoin(self.vision_endpoint, "computervision/retrieval:vectorizeText")
headers = {"Content-Type": "application/json"}
params = {"api-version": "2023-02-01-preview", "modelVersion": "latest"}
params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
data = {"text": q}

headers["Authorization"] = "Bearer " + await self.vision_token_provider()
2 changes: 2 additions & 0 deletions app/backend/approaches/chatreadretrieveread.py
@@ -35,6 +35,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
@@ -50,6 +51,7 @@ def __init__(
self.embedding_deployment = embedding_deployment
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.query_language = query_language
15 changes: 7 additions & 8 deletions app/backend/approaches/chatreadretrievereadvision.py
@@ -39,6 +39,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
@@ -58,6 +59,7 @@ def __init__(
self.embedding_deployment = embedding_deployment
self.embedding_model = embedding_model
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.query_language = query_language
@@ -89,7 +91,7 @@ async def run_until_final_call(
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
filter = self.build_filter(overrides, auth_claims)

vector_fields = overrides.get("vector_fields", ["embedding"])
vector_fields = overrides.get("vector_fields", "textAndImageEmbeddings")
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]

@@ -122,13 +124,10 @@ async def run_until_final_call(
# If retrieval mode includes vectors, compute an embedding for the query
vectors = []
if use_vector_search:
for field in vector_fields:
vector = (
await self.compute_text_embedding(query_text)
if field == "embedding"
else await self.compute_image_embedding(query_text)
)
vectors.append(vector)
if vector_fields == "textEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
vectors.append(await self.compute_text_embedding(query_text))
if vector_fields == "imageEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
vectors.append(await self.compute_image_embedding(query_text))

results = await self.search(
top,
2 changes: 2 additions & 0 deletions app/backend/approaches/retrievethenread.py
@@ -28,6 +28,7 @@ def __init__(
embedding_model: str,
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
@@ -44,6 +45,7 @@ def __init__(
self.embedding_dimensions = embedding_dimensions
self.chatgpt_deployment = chatgpt_deployment
self.embedding_deployment = embedding_deployment
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.query_language = query_language
15 changes: 7 additions & 8 deletions app/backend/approaches/retrievethenreadvision.py
@@ -33,6 +33,7 @@ def __init__(
embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text"
embedding_model: str,
embedding_dimensions: int,
embedding_field: str,
sourcepage_field: str,
content_field: str,
query_language: str,
@@ -48,6 +49,7 @@ def __init__(
self.embedding_model = embedding_model
self.embedding_deployment = embedding_deployment
self.embedding_dimensions = embedding_dimensions
self.embedding_field = embedding_field
self.sourcepage_field = sourcepage_field
self.content_field = content_field
self.gpt4v_deployment = gpt4v_deployment
@@ -84,20 +86,17 @@ async def run(
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
filter = self.build_filter(overrides, auth_claims)

vector_fields = overrides.get("vector_fields", ["embedding"])
vector_fields = overrides.get("vector_fields", "textAndImageEmbeddings")
send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None]
send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None]

# If retrieval mode includes vectors, compute an embedding for the query
vectors = []
if use_vector_search:
for field in vector_fields:
vector = (
await self.compute_text_embedding(q)
if field == "embedding"
else await self.compute_image_embedding(q)
)
vectors.append(vector)
if vector_fields == "textEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
vectors.append(await self.compute_text_embedding(q))
if vector_fields == "imageEmbeddingOnly" or vector_fields == "textAndImageEmbeddings":
vectors.append(await self.compute_image_embedding(q))

results = await self.search(
top,
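Both vision approaches above replace the list-valued `vector_fields` override with a single option string. A minimal sketch of the shared branching follows, assuming only the three values that appear in the diff ("textEmbeddingOnly", "imageEmbeddingOnly", "textAndImageEmbeddings") are valid; the helper name and signature are illustrative, with the `compute_*` calls standing in for the approach methods.

```python
# Sketch of the new vector_fields handling; the option strings come from the
# diff above, everything else is illustrative.
async def build_query_vectors(approach, query_text: str, vector_fields: str, use_vector_search: bool) -> list:
    vectors = []
    if use_vector_search:
        if vector_fields in ("textEmbeddingOnly", "textAndImageEmbeddings"):
            vectors.append(await approach.compute_text_embedding(query_text))
        if vector_fields in ("imageEmbeddingOnly", "textAndImageEmbeddings"):
            vectors.append(await approach.compute_image_embedding(query_text))
    return vectors
```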
3 changes: 3 additions & 0 deletions app/backend/prepdocs.py
@@ -398,6 +398,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
blob_manager=blob_manager,
document_action=document_action,
embeddings=openai_embeddings_service,
search_field_name_embedding=os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"],
subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
search_service_user_assigned_id=args.searchserviceassignedid,
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
@@ -430,6 +431,8 @@ async def main(strategy: Strategy, setup_index: bool = True):
embeddings=openai_embeddings_service,
image_embeddings=image_embeddings_service,
search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
# Default to the previous field names for backward compatibility
search_field_name_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding"),
use_acls=use_acls,
category=args.category,
use_content_understanding=use_content_understanding,
2 changes: 1 addition & 1 deletion app/backend/prepdocslib/embeddings.py
@@ -239,7 +239,7 @@ def __init__(self, endpoint: str, token_provider: Callable[[], Awaitable[str]]):
async def create_embeddings(self, blob_urls: list[str]) -> list[list[float]]:
endpoint = urljoin(self.endpoint, "computervision/retrieval:vectorizeImage")
headers = {"Content-Type": "application/json"}
params = {"api-version": "2023-02-01-preview", "modelVersion": "latest"}
params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
headers["Authorization"] = "Bearer " + await self.token_provider()

embeddings: list[list[float]] = []
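The `api-version` bump in embeddings.py (and the matching change in approach.py) moves the image-retrieval calls off the `2023-02-01-preview` API onto a GA version with a pinned model. A minimal sketch of the corresponding `vectorizeText` request (as changed in approach.py) is below; the endpoint path, headers, and query parameters mirror the diff, while the aiohttp usage, function name, and the "vector" response key are assumptions.

```python
# Sketch of a Computer Vision "vectorizeText" call using the updated
# api-version / model-version from this PR; client details are illustrative.
from urllib.parse import urljoin

import aiohttp


async def vectorize_text(vision_endpoint: str, bearer_token: str, text: str) -> list[float]:
    url = urljoin(vision_endpoint, "computervision/retrieval:vectorizeText")
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {bearer_token}"}
    params = {"api-version": "2024-02-01", "model-version": "2023-04-15"}
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, params=params, json={"text": text}) as response:
            response.raise_for_status()
            body = await response.json()
            return body["vector"]  # embedding vector (response key assumed)
```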
34 changes: 24 additions & 10 deletions app/backend/prepdocslib/filestrategy.py
@@ -51,6 +51,7 @@ def __init__(
embeddings: Optional[OpenAIEmbeddings] = None,
image_embeddings: Optional[ImageEmbeddings] = None,
search_analyzer_name: Optional[str] = None,
search_field_name_embedding: Optional[str] = None,
use_acls: bool = False,
category: Optional[str] = None,
use_content_understanding: bool = False,
@@ -63,22 +64,27 @@ def __init__(
self.embeddings = embeddings
self.image_embeddings = image_embeddings
self.search_analyzer_name = search_analyzer_name
self.search_field_name_embedding = search_field_name_embedding
self.search_info = search_info
self.use_acls = use_acls
self.category = category
self.use_content_understanding = use_content_understanding
self.content_understanding_endpoint = content_understanding_endpoint

async def setup(self):
search_manager = SearchManager(
def setup_search_manager(self):
self.search_manager = SearchManager(
self.search_info,
self.search_analyzer_name,
self.use_acls,
False,
self.embeddings,
field_name_embedding=self.search_field_name_embedding,
search_images=self.image_embeddings is not None,
)
await search_manager.create_index()

async def setup(self):
self.setup_search_manager()
await self.search_manager.create_index()

if self.use_content_understanding:
if self.content_understanding_endpoint is None:
@@ -91,9 +97,7 @@ async def setup(self):
await cu_manager.create_analyzer()

async def run(self):
search_manager = SearchManager(
self.search_info, self.search_analyzer_name, self.use_acls, False, self.embeddings
)
self.setup_search_manager()
if self.document_action == DocumentAction.Add:
files = self.list_file_strategy.list()
async for file in files:
@@ -104,18 +108,18 @@ async def run(self):
blob_image_embeddings: Optional[list[list[float]]] = None
if self.image_embeddings and blob_sas_uris:
blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
await search_manager.update_content(sections, blob_image_embeddings, url=file.url)
await self.search_manager.update_content(sections, blob_image_embeddings, url=file.url)
finally:
if file:
file.close()
elif self.document_action == DocumentAction.Remove:
paths = self.list_file_strategy.list_paths()
async for path in paths:
await self.blob_manager.remove_blob(path)
await search_manager.remove_content(path)
await self.search_manager.remove_content(path)
elif self.document_action == DocumentAction.RemoveAll:
await self.blob_manager.remove_blob()
await search_manager.remove_content()
await self.search_manager.remove_content()


class UploadUserFileStrategy:
@@ -129,12 +133,22 @@ def __init__(
file_processors: dict[str, FileProcessor],
embeddings: Optional[OpenAIEmbeddings] = None,
image_embeddings: Optional[ImageEmbeddings] = None,
search_field_name_embedding: Optional[str] = None,
):
self.file_processors = file_processors
self.embeddings = embeddings
self.image_embeddings = image_embeddings
self.search_info = search_info
self.search_manager = SearchManager(self.search_info, None, True, False, self.embeddings)
self.search_manager = SearchManager(
search_info=self.search_info,
search_analyzer_name=None,
use_acls=True,
use_int_vectorization=False,
embeddings=self.embeddings,
field_name_embedding=search_field_name_embedding,
search_images=False,
)
self.search_field_name_embedding = search_field_name_embedding

async def add_file(self, file: File):
if self.image_embeddings: