Skip to content

Commit 517106f

Browse files
committed
[ENH] Python client cleanup: export hosted ef from utils, fix sparse auto-embed
1 parent 61639bb commit 517106f

File tree

3 files changed

+616
-13
lines changed

3 files changed

+616
-13
lines changed

chromadb/api/models/CollectionCommon.py

Lines changed: 53 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -584,15 +584,19 @@ def _apply_sparse_embeddings_to_metadatas(
584584
metadatas: Optional[List[Metadata]],
585585
documents: Optional[List[Document]] = None,
586586
) -> Optional[List[Metadata]]:
587-
if metadatas is None:
588-
return None
589-
590587
sparse_targets = self._get_sparse_embedding_targets()
591588
if not sparse_targets:
592589
return metadatas
593590

591+
# If no metadatas provided, create empty dicts based on documents length
592+
if metadatas is None:
593+
if documents is None:
594+
return None
595+
metadatas = [{} for _ in range(len(documents))]
596+
597+
# Create copies, converting None to empty dict
594598
updated_metadatas: List[Dict[str, Any]] = [
595-
dict(metadata) for metadata in metadatas
599+
dict(metadata) if metadata is not None else {} for metadata in metadatas
596600
]
597601

598602
documents_list = list(documents) if documents is not None else None
@@ -607,19 +611,51 @@ def _apply_sparse_embeddings_to_metadatas(
607611
embedding_func = cast(SparseEmbeddingFunction[Any], embedding_func)
608612
validate_sparse_embedding_function(embedding_func)
609613

614+
# Initialize collection lists for batch processing
610615
inputs: List[str] = []
611616
positions: List[int] = []
612617

618+
# Handle special case: source_key is "#document"
619+
if source_key == DOCUMENT_KEY:
620+
if documents_list is None:
621+
continue
622+
623+
# Collect documents that need embedding
624+
for idx, metadata in enumerate(updated_metadatas):
625+
# Skip if target already exists in metadata
626+
if target_key in metadata:
627+
continue
628+
629+
# Get document at this position
630+
if idx < len(documents_list):
631+
doc = documents_list[idx]
632+
if isinstance(doc, str):
633+
inputs.append(doc)
634+
positions.append(idx)
635+
636+
# Generate embeddings for all collected documents
637+
if inputs:
638+
sparse_embeddings = self._sparse_embed(
639+
input=inputs,
640+
sparse_embedding_function=embedding_func,
641+
)
642+
643+
if len(sparse_embeddings) != len(positions):
644+
raise ValueError(
645+
"Sparse embedding function returned unexpected number of embeddings."
646+
)
647+
648+
for position, embedding in zip(positions, sparse_embeddings):
649+
updated_metadatas[position][target_key] = embedding
650+
651+
continue # Skip the metadata-based logic below
652+
653+
# Handle normal case: source_key is a metadata field
613654
for idx, metadata in enumerate(updated_metadatas):
614655
if target_key in metadata:
615656
continue
616657

617-
if source_key == DOCUMENT_KEY:
618-
source_value = None
619-
if documents_list is not None and idx < len(documents_list):
620-
source_value = documents_list[idx]
621-
else:
622-
source_value = metadata.get(source_key)
658+
source_value = metadata.get(source_key)
623659
if not isinstance(source_value, str):
624660
continue
625661

@@ -642,8 +678,13 @@ def _apply_sparse_embeddings_to_metadatas(
642678
for position, embedding in zip(positions, sparse_embeddings):
643679
updated_metadatas[position][target_key] = embedding
644680

645-
validate_metadatas(cast(List[Metadata], updated_metadatas))
646-
return cast(List[Metadata], updated_metadatas)
681+
# Convert empty dicts back to None; validation requires non-empty dicts or None
682+
result_metadatas: List[Optional[Metadata]] = [
683+
metadata if metadata else None for metadata in updated_metadatas
684+
]
685+
686+
validate_metadatas(cast(List[Metadata], result_metadatas))
687+
return cast(List[Metadata], result_metadatas)
647688

648689
def _embed_record_set(
649690
self,

0 commit comments

Comments
 (0)