AmadeusITGroup · dpomian · Jul 30, 2025 · Jul 25, 2025
diff --git a/.gitignore b/.gitignore
@@ -74,3 +74,10 @@ docs/_build
 
 # Mypy Cache
 .mypy_cache/
+
+# logs
+logs/indexer_skills.log
+src/logs/indexer_skills.log
+
+# chroma index sqlite3
+src/localhost/chroma.sqlite3
diff --git a/docs/readme/indexer-skills.md b/docs/readme/indexer-skills.md
@@ -213,8 +213,27 @@ Stores embeddings in a Chroma vector store. Ideal for prototyping.
     name: chromadb
     params:
         db_path: path/to/where/your/chroma/db/is    # if you don't have any yet, a new one will be created at the specified path
-        collection_name: replace-this-with-your-collection-name # if you don't have a collection yet, a new one will be created when documents are inseted
+        collection_name: replace-this-with-your-collection-name # if you don't have a collection yet, a new one will be created when documents are inserted
 ```
+
+### FAISS
+Stores embeddings in a FAISS vector store.
+
+```yaml
+- skill: &FaissDbVectorStore
+    type: vector-store
+    name: faissdb
+    params:
+        db_path: path/to/where/your/faiss/db/is    # if you don't have any yet, a new one will be created at the specified path
+        dimension: replace-with-your-embeddings-dimension # Ensure that the correct dimension is provided. It must match the output dimension of the embedding model you have selected
+        overwrite_index: true  # true - before storing data, it will remove all the documents from your index. false - will append only the documents whose ids are not already in your index
+
+```     
+
+
+
+
+
 </details>
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,8 @@ dependencies = [
     "python-dotenv>=1.0.1",
     "python-pptx>=1.0.2",
     "unstructured>=0.14.8",
+    "faiss-cpu>=1.11.0",
+    "langchain_community>=0.3.18",
 ]
 
 [project.scripts]

diff --git a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml
@@ -119,6 +119,9 @@ definitions:
               api_token:
                 type: string
                 required: False
+              dimension:
+                type: integer
+                required: False
               field_mapping:
                 type: dict
                 schema:

diff --git a/src/docs2vecs/subcommands/indexer/skills/__init__.py b/src/docs2vecs/subcommands/indexer/skills/__init__.py
@@ -3,6 +3,17 @@
 from .document_intelligence_skill import AzureDocumentIntelligenceSkill
 from .jira_loader_skill import JiraLoaderSkill
 from .scrollwordexporter_skill import ScrollWorldExporterSkill
+from .chromadb_vector_store_skill import ChromaDBVectorStoreSkill
+from .tracker import VectorStoreTracker
+from .semantic_splitter_skill import SemanticSplitter
+from .recursive_character_splitter_skill import RecursiveCharacterTextSplitter
+from .azure_blob_store_uploader_skill import AzureBlobStoreUploaderSkill
+from .default_file_reader import DefaultFileReader
+from .file_scanner_skill import FileScannerSkill
+from .llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill
+from .local_document_parser import LocalDocumentParser
+from .faiss_vector_store_skill import FaissVectorStoreSkill
+
 
 __all__ = [
     "AzureAda002EmbeddingSkill",
@@ -11,4 +22,13 @@
     "JiraLoaderSkill",
     "ScrollWorldExporterSkill",
     "VectorStoreTracker",
+    "ChromaDBVectorStoreSkill",
+    "SemanticSplitter",
+    "RecursiveCharacterTextSplitter",
+    "AzureBlobStoreUploaderSkill",
+    "DefaultFileReader",
+    "FileScannerSkill",
+    "LlamaFastembedEmbeddingSkill",
+    "LocalDocumentParser",
+    "FaissVectorStoreSkill",
 ]
diff --git a/src/docs2vecs/subcommands/indexer/skills/chromadb_vector_store_skill.py b/src/docs2vecs/subcommands/indexer/skills/chromadb_vector_store_skill.py
@@ -11,16 +11,23 @@
 
 
 class ChromaDBVectorStoreSkill(IndexerSkill):
-    def __init__(self, config: dict, global_config: Config, vector_store_tracker: Optional[VectorStoreTracker] = None) -> None:
+    def __init__(
+        self,
+        config: dict,
+        global_config: Config,
+        vector_store_tracker: Optional[VectorStoreTracker] = None,
+    ) -> None:
         super().__init__(config, global_config)
         self._vector_store_tracker = vector_store_tracker
 
     def run(self, input: Optional[List[Document]] = None) -> List[Document]:
         self.logger.info("Running ChromaDBVectorStoreSkill...")
 
         db_path = Path(self._config["db_path"]).expanduser().resolve().as_posix()
-        chroma_client = self._get_client(db_path)
-        chroma_collection = chroma_client.get_or_create_collection(self._config["collection_name"])
+        chroma_client: chromadb.Client = self._get_client(db_path)
+        chroma_collection = chroma_client.get_or_create_collection(
+            self._config["collection_name"]
+        )
 
         self.logger.debug(f"Going to process {len(input)} documents")
         for doc in input:
@@ -29,7 +36,10 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
                 ids=[chunk.chunk_id for chunk in doc.chunks],
                 embeddings=[chunk.embedding for chunk in doc.chunks],
                 documents=[chunk.content for chunk in doc.chunks],
-                metadatas=[{"source": chunk.source_link, "tags": doc.tag} for chunk in doc.chunks],
+                metadatas=[
+                    {"source": chunk.source_link, "tags": doc.tag}
+                    for chunk in doc.chunks
+                ],
             )
 
         return input

diff --git a/src/docs2vecs/subcommands/indexer/skills/factory.py b/src/docs2vecs/subcommands/indexer/skills/factory.py
@@ -2,19 +2,20 @@
 
 from docs2vecs.subcommands.indexer.config import Config
 from docs2vecs.subcommands.indexer.db.mongodb import MongoDbConnection
-from docs2vecs.subcommands.indexer.skills.ada002_embedding_skill import AzureAda002EmbeddingSkill
-from docs2vecs.subcommands.indexer.skills.azure_blob_store_uploader_skill import AzureBlobStoreUploaderSkill
-from docs2vecs.subcommands.indexer.skills.azure_vector_store_skill import AzureVectorStoreSkill
-from docs2vecs.subcommands.indexer.skills.chromadb_vector_store_skill import ChromaDBVectorStoreSkill
-from docs2vecs.subcommands.indexer.skills.default_file_reader import DefaultFileReader
-from docs2vecs.subcommands.indexer.skills.document_intelligence_skill import AzureDocumentIntelligenceSkill
-from docs2vecs.subcommands.indexer.skills.file_scanner_skill import FileScannerSkill
-from docs2vecs.subcommands.indexer.skills.jira_loader_skill import JiraLoaderSkill
-from docs2vecs.subcommands.indexer.skills.llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill
-from docs2vecs.subcommands.indexer.skills.recursive_character_splitter_skill import RecursiveCharacterTextSplitter
-from docs2vecs.subcommands.indexer.skills.scrollwordexporter_skill import ScrollWorldExporterSkill
-from docs2vecs.subcommands.indexer.skills.semantic_splitter_skill import SemanticSplitter
-from docs2vecs.subcommands.indexer.skills.tracker import VectorStoreTracker
+from docs2vecs.subcommands.indexer.skills import AzureAda002EmbeddingSkill
+from docs2vecs.subcommands.indexer.skills import AzureBlobStoreUploaderSkill
+from docs2vecs.subcommands.indexer.skills import AzureVectorStoreSkill
+from docs2vecs.subcommands.indexer.skills import ChromaDBVectorStoreSkill
+from docs2vecs.subcommands.indexer.skills import DefaultFileReader
+from docs2vecs.subcommands.indexer.skills import AzureDocumentIntelligenceSkill
+from docs2vecs.subcommands.indexer.skills import FileScannerSkill
+from docs2vecs.subcommands.indexer.skills import JiraLoaderSkill
+from docs2vecs.subcommands.indexer.skills import LlamaFastembedEmbeddingSkill
+from docs2vecs.subcommands.indexer.skills import RecursiveCharacterTextSplitter
+from docs2vecs.subcommands.indexer.skills import ScrollWorldExporterSkill
+from docs2vecs.subcommands.indexer.skills import SemanticSplitter
+from docs2vecs.subcommands.indexer.skills import VectorStoreTracker
+from docs2vecs.subcommands.indexer.skills import FaissVectorStoreSkill
 
 
 class SkillType(StrEnum):
@@ -42,6 +43,7 @@ class AvailableSkillName(StrEnum):
     # vector stores
     AZ_AISearch = "azure-ai-search"
     CHROMADB = "chromadb"
+    FAISSDB = "faissdb"
 
     # uplaoders
     AZ_BLOB_STORE = "azure-blob-store"
@@ -74,6 +76,7 @@ class AvailableSkillName(StrEnum):
     SkillType.VECTOR_STORE: {
         AvailableSkillName.AZ_AISearch: AzureVectorStoreSkill,
         AvailableSkillName.CHROMADB: ChromaDBVectorStoreSkill,
+        AvailableSkillName.FAISSDB: FaissVectorStoreSkill,
     },
     SkillType.UPLOADER: {AvailableSkillName.AZ_BLOB_STORE: AzureBlobStoreUploaderSkill},
     SkillType.SPLITTER: {
@@ -90,9 +93,13 @@ def get_skill(cls, skill_config_dict: dict, global_config: Config):
         try:
             skill_type = SkillType(skill_config_dict["type"])
             avail_skill_name = AvailableSkillName(skill_config_dict["name"])
-            return AVAILABLE_SKILLS[skill_type][avail_skill_name](skill_config_dict, global_config)
+            return AVAILABLE_SKILLS[skill_type][avail_skill_name](
+                skill_config_dict, global_config
+            )
         except ValueError as error:
-            raise ValueError(f"Unknown skill of type: {skill_config_dict['type']}, and name: {skill_config_dict['name']}") from error
+            raise ValueError(
+                f"Unknown skill of type: {skill_config_dict['type']}, and name: {skill_config_dict['name']}"
+            ) from error
 
 
 class TrackerFactory:

diff --git a/src/docs2vecs/subcommands/indexer/skills/faiss_vector_store_skill.py b/src/docs2vecs/subcommands/indexer/skills/faiss_vector_store_skill.py
@@ -0,0 +1,129 @@
+from pathlib import Path
+from typing import List
+from typing import Dict
+from typing import Optional
+from typing import Any
+
+import faiss
+import os
+from docs2vecs.subcommands.indexer.config.config import Config
+from docs2vecs.subcommands.indexer.document.document import Document
+from docs2vecs.subcommands.indexer.skills.skill import IndexerSkill
+from docs2vecs.subcommands.indexer.skills.tracker import VectorStoreTracker
+from langchain_community.vectorstores import FAISS
+from langchain_community.docstore.in_memory import InMemoryDocstore
+
+
+class FaissVectorStoreSkill(IndexerSkill):
+    """
+    Store document chunk embeddings in a FAISS index persisted on disk.
+
+    Uses a flat L2 index (``faiss.IndexFlatL2``). Config keys read here:
+    ``db_path``, ``dimension`` and ``overwrite_index`` (defaults to False).
+    """
+
+    def __init__(
+        self,
+        config: Dict[str, Any],
+        global_config: Config,
+        vector_store_tracker: Optional[VectorStoreTracker] = None,
+    ) -> None:
+        super().__init__(config, global_config)
+        self._vector_store_tracker = vector_store_tracker
+        self._overwrite_index = self._config.get("overwrite_index", False)
+        self._VECTOR_DIMENSION = self._config.get("dimension")
+        # Fail early with a readable message. TypeError (not ValueError) because
+        # the skill factory's get_skill rewraps ValueError as "Unknown skill".
+        if not isinstance(self._VECTOR_DIMENSION, int):
+            raise TypeError("FaissVectorStoreSkill: 'dimension' must be an integer")
+        self.faiss_index = faiss.IndexFlatL2(self._VECTOR_DIMENSION)
+
+    def run(self, input: Optional[List[Document]] = None) -> List[Document]:
+        """
+        Add the chunks of every document to the index and save it to db_path.
+
+        With overwrite_index the stored entries are deleted first; otherwise
+        only chunks whose id is not in the index yet are appended.
+        Returns the input unchanged.
+        """
+        self.logger.info("Running FaissVectorStoreSkill...")
+        db_path = Path(self._config.get("db_path")).expanduser().resolve().as_posix()
+        vector_store = self._get_vector_store(db_path, input)
+        # A set gives O(1) membership tests; it is kept up to date below so an
+        # id repeated later in the same run is not inserted twice.
+        known_ids = set(vector_store.index_to_docstore_id.values())
+        self.logger.debug(f"the value of overwrite_index is : {self._overwrite_index}")
+
+        if self._overwrite_index and known_ids:
+            self.logger.info("Overwriting existing index.")
+            vector_store.delete(ids=list(known_ids))
+            known_ids.clear()
+
+        for doc in input or []:
+            self.logger.info(f"Processing document: {doc.filename}")
+            ids_to_add = []
+            text_embeddings_to_add = []
+            metadatas_to_add = []
+            for chunk in doc.chunks:
+                if chunk.chunk_id in known_ids:
+                    continue
+                known_ids.add(chunk.chunk_id)
+                ids_to_add.append(chunk.chunk_id)
+                text_embeddings_to_add.append((chunk.content, chunk.embedding))
+                metadatas_to_add.append({"source": chunk.source_link, "tags": doc.tag})
+
+            if not ids_to_add:
+                self.logger.info("No new embeddings to add (all ids already exist).")
+                continue
+            self.logger.info(
+                f"Adding {len(ids_to_add)} new embeddings to the vector store."
+            )
+            # Pass only the filtered chunks so texts, metadatas and ids line up.
+            vector_store.add_embeddings(
+                text_embeddings=text_embeddings_to_add,
+                metadatas=metadatas_to_add,
+                ids=ids_to_add,
+            )
+
+        vector_store.save_local(db_path)
+        return input
+
+    def _get_embeddings(self, input: Optional[List[Document]] = None) -> List[Any]:
+        """Collect the embedding of every chunk of every document, in order."""
+        data = []
+        for doc in input or []:
+            self.logger.debug(f"Processing document: {doc.filename}")
+            data.extend(chunk.embedding for chunk in doc.chunks)
+        return data
+
+    def _get_vector_store(
+        self, db_path: str, input: Optional[List[Document]] = None
+    ) -> FAISS:
+        """
+        Load the FAISS store saved under db_path, or create an empty one.
+
+        NOTE(review): the list from _get_embeddings is handed to LangChain as
+        the embedding function. Nothing in this skill calls it, but querying
+        the returned store by text would need a real embedding model - confirm.
+        """
+        index_path = os.path.join(db_path, "index.faiss")
+
+        if os.path.exists(index_path):
+            self.logger.info(f"FAISS index found at {index_path}.")
+            # SECURITY: allow_dangerous_deserialization lets LangChain unpickle
+            # the saved docstore; only load index directories you trust.
+            return FAISS.load_local(
+                db_path,
+                embeddings=self._get_embeddings(input),
+                allow_dangerous_deserialization=True,
+            )
+
+        self.logger.info(
+            f"FAISS index not found at {index_path}. Creating a new one."
+        )
+        return FAISS(
+            index=self.faiss_index,
+            embedding_function=self._get_embeddings(input),
+            docstore=InMemoryDocstore(),
+            index_to_docstore_id={},
+        )
diff --git a/src/docs2vecs/subcommands/indexer/skills/recursive_character_splitter_skill.py b/src/docs2vecs/subcommands/indexer/skills/recursive_character_splitter_skill.py
@@ -12,16 +12,19 @@
 
 
 class RecursiveCharacterTextSplitter(IndexerSkill):
+    DEFAULT_CHUNK_SIZE = 1000
+    DEFAULT_CHUNK_OVERLAP = 100
+
     def __init__(self, config: dict, global_config: Config):
         super().__init__(config, global_config)
         self._set_config_defaults()
 
     def _set_config_defaults(self):
         if "chunk_size" not in self._config:
-            self._config["chunk_size"] = 1000
+            self._config["chunk_size"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_SIZE
 
         if "chunk_overlap" not in self._config:
-            self._config["chunk_overlap"] = 100
+            self._config["chunk_overlap"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_OVERLAP
 
     def run(self, input: Optional[List[Document]] = None) -> List[Document]:
         self.logger.info("Running RecursiveCharacterTextSplitter...")

diff --git a/tests/test_data/test_file.txt b/tests/test_data/test_file.txt
@@ -0,0 +1 @@
+Hi, this is a test file version 1.