chroma-core · Gwenn-LR · Oct 13, 2025 · Oct 13, 2025 · Oct 13, 2025 · Oct 14, 2025
diff --git a/chromadb/utils/embedding_functions/mistral_embedding_function.py b/chromadb/utils/embedding_functions/mistral_embedding_function.py
@@ -10,13 +10,15 @@ def __init__(
         self,
         model: str,
         api_key_env_var: str = "MISTRAL_API_KEY",
+        max_chunk_size: int = 50
     ):
         """
         Initialize the MistralEmbeddingFunction.
 
         Args:
             model (str): The name of the model to use for text embeddings.
             api_key_env_var (str): The environment variable name for the Mistral API key.
+            max_chunk_size (int): The maximal size of chunk to embed. Default to 50.
         """
         try:
             from mistralai import Mistral
@@ -30,23 +32,29 @@ def __init__(
         if not self.api_key:
             raise ValueError(f"The {api_key_env_var} environment variable is not set.")
         self.client = Mistral(api_key=self.api_key)
+        self.max_chunk_size = max_chunk_size
 
     def __call__(self, input: Documents) -> Embeddings:
         """
         Get the embeddings for a list of texts.
 
+        Note:
+            Mistral AI API will provide auto-chunking in the future (https://docs.mistral.ai/capabilities/embeddings/text_embeddings/#batch-processing)
+
         Args:
             input (Documents): A list of texts to get embeddings for.
         """
         if not all(isinstance(item, str) for item in input):
             raise ValueError("Mistral only supports text documents, not images")
-        output = self.client.embeddings.create(
-            model=self.model,
-            inputs=input,
+
+        chunks = (input[x : x + self.max_chunk_size] for x in range(0, len(input), self.max_chunk_size))
-        chunks = (input[x : x + self.max_chunk_size] for x in range(0, len(input), self.max_chunk_size))
+        if not input:
+            return []
+            
+        chunks = (input[x : x + self.max_chunk_size] for x in range(0, len(input), self.max_chunk_size))
-        chunks = (input[x : x + self.max_chunk_size] for x in range(0, len(input), self.max_chunk_size))
+        if not input:
+            return []
+            
+        chunks = (input[x : x + self.max_chunk_size] for x in range(0, len(input), self.max_chunk_size))
+        embeddings_response = (
+            self.client.embeddings.create(model=self.model, inputs=c) for c in chunks
         )
 
+
         # Extract embeddings from the response
-        return [np.array(data.embedding) for data in output.data]
+        return [np.array(d.embedding) for e in embeddings_response for d in e.data]
 
     @staticmethod
     def name() -> str:
@@ -62,15 +70,17 @@ def supported_spaces(self) -> List[Space]:
     def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Documents]":
         model = config.get("model")
         api_key_env_var = config.get("api_key_env_var")
+        max_chunk_size = config.get("max_chunk_size")
 
-        if model is None or api_key_env_var is None:
+        if model is None or api_key_env_var is None or max_chunk_size is None:
             assert False, "This code should not be reached"  # this is for type checking
-        return MistralEmbeddingFunction(model=model, api_key_env_var=api_key_env_var)
+        return MistralEmbeddingFunction(model=model, api_key_env_var=api_key_env_var, max_chunk_size=max_chunk_size)
 
     def get_config(self) -> Dict[str, Any]:
         return {
             "model": self.model,
             "api_key_env_var": self.api_key_env_var,
+            "max_chunk_size": self.max_chunk_size
         }
 
     def validate_config_update(