@@ -584,15 +584,19 @@ def _apply_sparse_embeddings_to_metadatas(
         metadatas: Optional[List[Metadata]],
         documents: Optional[List[Document]] = None,
     ) -> Optional[List[Metadata]]:
-        if metadatas is None:
-            return None
-
         sparse_targets = self._get_sparse_embedding_targets()
         if not sparse_targets:
             return metadatas
 
+        # If no metadatas provided, create empty dicts based on documents length
+        if metadatas is None:
+            if documents is None:
+                return None
+            metadatas = [{} for _ in range(len(documents))]
+
+        # Create copies, converting None to empty dict
         updated_metadatas: List[Dict[str, Any]] = [
-            dict(metadata) for metadata in metadatas
+            dict(metadata) if metadata is not None else {} for metadata in metadatas
         ]
 
         documents_list = list(documents) if documents is not None else None
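
The hunk above changes the early-exit behavior: a missing `metadatas` list no longer short-circuits the method; instead, one empty dict per document is fabricated so sparse embeddings still have somewhere to land, and `None` entries are normalized to `{}`. A minimal standalone sketch of that normalization, assuming plain string documents and a hypothetical `normalize_metadatas` helper (the real code operates on `self` and the library's `Metadata` type):

```python
from typing import Any, Dict, List, Optional

def normalize_metadatas(
    metadatas: Optional[List[Optional[Dict[str, Any]]]],
    documents: Optional[List[str]],
) -> Optional[List[Dict[str, Any]]]:
    # No metadatas at all: fabricate one empty dict per document, or give up
    # if there are no documents either.
    if metadatas is None:
        if documents is None:
            return None
        metadatas = [{} for _ in range(len(documents))]
    # Copy each entry, turning None into an empty dict.
    return [dict(m) if m is not None else {} for m in metadatas]

assert normalize_metadatas(None, ["a", "b"]) == [{}, {}]
assert normalize_metadatas([None, {"k": 1}], ["a", "b"]) == [{}, {"k": 1}]
assert normalize_metadatas(None, None) is None
```
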
@@ -607,19 +611,51 @@ def _apply_sparse_embeddings_to_metadatas(
             embedding_func = cast(SparseEmbeddingFunction[Any], embedding_func)
             validate_sparse_embedding_function(embedding_func)
 
+            # Initialize collection lists for batch processing
             inputs: List[str] = []
             positions: List[int] = []
 
+            # Handle special case: source_key is "#document"
+            if source_key == DOCUMENT_KEY:
+                if documents_list is None:
+                    continue
+
+                # Collect documents that need embedding
+                for idx, metadata in enumerate(updated_metadatas):
+                    # Skip if target already exists in metadata
+                    if target_key in metadata:
+                        continue
+
+                    # Get document at this position
+                    if idx < len(documents_list):
+                        doc = documents_list[idx]
+                        if isinstance(doc, str):
+                            inputs.append(doc)
+                            positions.append(idx)
+
+                # Generate embeddings for all collected documents
+                if inputs:
+                    sparse_embeddings = self._sparse_embed(
+                        input=inputs,
+                        sparse_embedding_function=embedding_func,
+                    )
+
+                    if len(sparse_embeddings) != len(positions):
+                        raise ValueError(
+                            "Sparse embedding function returned unexpected number of embeddings."
+                        )
+
+                    for position, embedding in zip(positions, sparse_embeddings):
+                        updated_metadatas[position][target_key] = embedding
+
+                continue  # Skip the metadata-based logic below
+
+            # Handle normal case: source_key is a metadata field
             for idx, metadata in enumerate(updated_metadatas):
                 if target_key in metadata:
                     continue
 
-                if source_key == DOCUMENT_KEY:
-                    source_value = None
-                    if documents_list is not None and idx < len(documents_list):
-                        source_value = documents_list[idx]
-                else:
-                    source_value = metadata.get(source_key)
+                source_value = metadata.get(source_key)
                 if not isinstance(source_value, str):
                     continue
 
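
The special-case branch added above follows a collect/embed/scatter pattern: gather only the rows that still need an embedding, call the sparse embedding function once on the whole batch, then write results back by position. A hedged, self-contained sketch of that pattern; `embed_documents_into_metadatas`, `embed_fn`, and `SparseVector` below are illustrative stand-ins, not the library's `_sparse_embed` / `SparseEmbeddingFunction` API:

```python
from typing import Any, Callable, Dict, List, Optional, Sequence

SparseVector = Dict[str, Any]  # placeholder for whatever the embedder returns

def embed_documents_into_metadatas(
    documents: Sequence[Optional[str]],
    metadatas: Sequence[Dict[str, Any]],
    target_key: str,
    embed_fn: Callable[[List[str]], List[SparseVector]],
) -> List[Dict[str, Any]]:
    """Collect rows that still need an embedding, embed them in one batch,
    and scatter the results back by position."""
    out = [dict(m) for m in metadatas]  # work on copies, as the patch does

    inputs: List[str] = []
    positions: List[int] = []
    for idx, metadata in enumerate(out):
        if target_key in metadata:
            continue  # never overwrite a value the caller supplied
        doc = documents[idx] if idx < len(documents) else None
        if isinstance(doc, str):
            inputs.append(doc)
            positions.append(idx)

    if inputs:
        embeddings = embed_fn(inputs)
        if len(embeddings) != len(positions):
            raise ValueError("embed_fn returned an unexpected number of embeddings")
        for position, embedding in zip(positions, embeddings):
            out[position][target_key] = embedding
    return out

# Toy usage: a fake embedder that records the token count per document.
metas = embed_documents_into_metadatas(
    documents=["hello world", None, "skip me"],
    metadatas=[{}, {}, {"sparse": "precomputed"}],
    target_key="sparse",
    embed_fn=lambda texts: [
        {"indices": [0], "values": [float(len(t.split()))]} for t in texts
    ],
)
# metas[0] receives a generated embedding, metas[1] stays empty (no document),
# metas[2] keeps its precomputed value.
```
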
@@ -642,8 +678,13 @@ def _apply_sparse_embeddings_to_metadatas(
                 for position, embedding in zip(positions, sparse_embeddings):
                     updated_metadatas[position][target_key] = embedding
 
-        validate_metadatas(cast(List[Metadata], updated_metadatas))
-        return cast(List[Metadata], updated_metadatas)
+        # Convert empty dicts back to None, validation requires non-empty dicts or None
+        result_metadatas: List[Optional[Metadata]] = [
+            metadata if metadata else None for metadata in updated_metadatas
+        ]
+
+        validate_metadatas(cast(List[Metadata], result_metadatas))
+        return cast(List[Metadata], result_metadatas)
 
     def _embed_record_set(
         self,
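
The final hunk's conversion of untouched rows back to `None` is what keeps validation passing, since (per the added comment) it accepts non-empty dicts or `None` but not `{}`. A tiny self-contained illustration, with `sparse_embedding` used here only as a placeholder target key:

```python
from typing import Any, Dict, List, Optional

# Rows that never received metadata or an embedding revert to None.
updated: List[Dict[str, Any]] = [
    {"sparse_embedding": {"indices": [2], "values": [0.5]}},
    {},  # nothing was added for this record
]
result: List[Optional[Dict[str, Any]]] = [m if m else None for m in updated]
assert result == [{"sparse_embedding": {"indices": [2], "values": [0.5]}}, None]
```
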