Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ updates:
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "weekly"
interval: "weekly"
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,10 @@ docs/_build

# Mypy Cache
.mypy_cache/

# logs
logs/indexer_skills.log
src/logs/indexer_skills.log

# chroma index sqlite3
src/localhost/chroma.sqlite3
22 changes: 21 additions & 1 deletion docs/readme/indexer-skills.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,28 @@ Stores embeddings in a Chroma vector store. Ideal for prototyping.
name: chromadb
params:
db_path: path/to/where/your/chroma/db/is # if you don't have any yet, a new one will be created at the specified path
collection_name: replace-this-with-your-collection-name # if you don't have a collection yet, a new one will be created when documents are inseted
collection_name: replace-this-with-your-collection-name # if you don't have a collection yet, a new one will be created when documents are inserted
```

### FAISS
Stores embeddings in a FAISS vector store.

```yaml
- skill: &FaissDbVectorStore
type: vector-store
name: faissdb
params:
db_path: path/to/where/your/faiss/db/is # if you don't have any yet, a new one will be created at the specified path
collection_name: replace-this-with-your-collection-name # if you don't have a collection yet, a new one will be created when documents are inserted
dimension: replace-with-your-embeddings-dimension # must match the dimension of your embeddings; otherwise, an error will occur.
overwrite_index: true # true - all existing documents are removed from your index before the new data is stored. false - new documents are appended to your index

```





</details>


Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ dependencies = [
"python-dotenv>=1.0.1",
"python-pptx>=1.0.2",
"unstructured>=0.14.8",
"faiss-cpu>=1.11.0",
"langchain_community>=0.3.18",
]

[project.scripts]
Expand Down
5 changes: 4 additions & 1 deletion src/docs2vecs/subcommands/indexer/config/config_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ definitions:
api_token:
type: string
required: False
dimension:
type: integer
required: False
field_mapping:
type: dict
schema:
Expand Down Expand Up @@ -207,4 +210,4 @@ indexer:
required: True
tracker:
type: dict
required: False
required: False
22 changes: 21 additions & 1 deletion src/docs2vecs/subcommands/indexer/skills/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@
from .document_intelligence_skill import AzureDocumentIntelligenceSkill
from .jira_loader_skill import JiraLoaderSkill
from .scrollwordexporter_skill import ScrollWorldExporterSkill
from .chromadb_vector_store_skill import ChromaDBVectorStoreSkill
from .tracker import VectorStoreTracker
from .semantic_splitter_skill import SemanticSplitter
from .recursive_character_splitter_skill import RecursiveCharacterTextSplitter
from .azure_blob_store_uploader_skill import AzureBlobStoreUploaderSkill
from .default_file_reader import DefaultFileReader
from .file_scanner_skill import FileScannerSkill
from .llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill
from .local_document_parser import LocalDocumentParser
from .faiss_vector_store_skill import FaissVectorStoreSkill


__all__ = [
"AzureAda002EmbeddingSkill",
Expand All @@ -11,4 +22,13 @@
"JiraLoaderSkill",
"ScrollWorldExporterSkill",
"VectorStoreTracker",
]
"ChromaDBVectorStoreSkill",
"SemanticSplitter",
"RecursiveCharacterTextSplitter",
"AzureBlobStoreUploaderSkill",
"DefaultFileReader",
"FileScannerSkill",
"LlamaFastembedEmbeddingSkill",
"LocalDocumentParser",
"FaissVectorStoreSkill"
]
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
self.logger.info("Running ChromaDBVectorStoreSkill...")

db_path = Path(self._config["db_path"]).expanduser().resolve().as_posix()
chroma_client = self._get_client(db_path)
chroma_client: chromadb.Client = self._get_client(db_path)
chroma_collection = chroma_client.get_or_create_collection(self._config["collection_name"])

self.logger.debug(f"Going to process {len(input)} documents")
Expand All @@ -35,4 +35,4 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
return input

def _get_client(self, db_path: str) -> chromadb.Client:
return chromadb.PersistentClient(path=db_path)
return chromadb.PersistentClient(path=db_path)
31 changes: 17 additions & 14 deletions src/docs2vecs/subcommands/indexer/skills/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,20 @@

from docs2vecs.subcommands.indexer.config import Config
from docs2vecs.subcommands.indexer.db.mongodb import MongoDbConnection
from docs2vecs.subcommands.indexer.skills.ada002_embedding_skill import AzureAda002EmbeddingSkill
from docs2vecs.subcommands.indexer.skills.azure_blob_store_uploader_skill import AzureBlobStoreUploaderSkill
from docs2vecs.subcommands.indexer.skills.azure_vector_store_skill import AzureVectorStoreSkill
from docs2vecs.subcommands.indexer.skills.chromadb_vector_store_skill import ChromaDBVectorStoreSkill
from docs2vecs.subcommands.indexer.skills.default_file_reader import DefaultFileReader
from docs2vecs.subcommands.indexer.skills.document_intelligence_skill import AzureDocumentIntelligenceSkill
from docs2vecs.subcommands.indexer.skills.file_scanner_skill import FileScannerSkill
from docs2vecs.subcommands.indexer.skills.jira_loader_skill import JiraLoaderSkill
from docs2vecs.subcommands.indexer.skills.llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill
from docs2vecs.subcommands.indexer.skills.recursive_character_splitter_skill import RecursiveCharacterTextSplitter
from docs2vecs.subcommands.indexer.skills.scrollwordexporter_skill import ScrollWorldExporterSkill
from docs2vecs.subcommands.indexer.skills.semantic_splitter_skill import SemanticSplitter
from docs2vecs.subcommands.indexer.skills.tracker import VectorStoreTracker
from docs2vecs.subcommands.indexer.skills import AzureAda002EmbeddingSkill
from docs2vecs.subcommands.indexer.skills import AzureBlobStoreUploaderSkill
from docs2vecs.subcommands.indexer.skills import AzureVectorStoreSkill
from docs2vecs.subcommands.indexer.skills import ChromaDBVectorStoreSkill
from docs2vecs.subcommands.indexer.skills import DefaultFileReader
from docs2vecs.subcommands.indexer.skills import AzureDocumentIntelligenceSkill
from docs2vecs.subcommands.indexer.skills import FileScannerSkill
from docs2vecs.subcommands.indexer.skills import JiraLoaderSkill
from docs2vecs.subcommands.indexer.skills import LlamaFastembedEmbeddingSkill
from docs2vecs.subcommands.indexer.skills import RecursiveCharacterTextSplitter
from docs2vecs.subcommands.indexer.skills import ScrollWorldExporterSkill
from docs2vecs.subcommands.indexer.skills import SemanticSplitter
from docs2vecs.subcommands.indexer.skills import VectorStoreTracker
from docs2vecs.subcommands.indexer.skills import FaissVectorStoreSkill


class SkillType(StrEnum):
Expand Down Expand Up @@ -42,6 +43,7 @@ class AvailableSkillName(StrEnum):
# vector stores
AZ_AISearch = "azure-ai-search"
CHROMADB = "chromadb"
FAISSDB = "faissdb"

# uploaders
AZ_BLOB_STORE = "azure-blob-store"
Expand Down Expand Up @@ -74,6 +76,7 @@ class AvailableSkillName(StrEnum):
SkillType.VECTOR_STORE: {
AvailableSkillName.AZ_AISearch: AzureVectorStoreSkill,
AvailableSkillName.CHROMADB: ChromaDBVectorStoreSkill,
AvailableSkillName.FAISSDB: FaissVectorStoreSkill,
},
SkillType.UPLOADER: {AvailableSkillName.AZ_BLOB_STORE: AzureBlobStoreUploaderSkill},
SkillType.SPLITTER: {
Expand Down Expand Up @@ -114,4 +117,4 @@ def get_db(db_config_dict: dict, global_config: Config = None):
db_name=db_config_dict["db_name"],
col_name=db_config_dict["collection_name"],
)
raise ValueError(f"Unknown db type: {db_config_dict['type']}")
raise ValueError(f"Unknown db type: {db_config_dict['type']}")
130 changes: 130 additions & 0 deletions src/docs2vecs/subcommands/indexer/skills/faiss_vector_store_skill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
from pathlib import Path
from typing import List
from typing import Dict
from typing import Optional
from typing import Any

import faiss
import os
from docs2vecs.subcommands.indexer.config.config import Config
from docs2vecs.subcommands.indexer.document.document import Document
from docs2vecs.subcommands.indexer.skills.skill import IndexerSkill
from docs2vecs.subcommands.indexer.skills.tracker import VectorStoreTracker
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore


class FaissVectorStoreSkill(IndexerSkill):
    """
    Faiss vector store skill for storing document embeddings.

    Chunk embeddings are written to a LangChain ``FAISS`` store backed by a
    flat L2 index (``faiss.IndexFlatL2``) and persisted under ``db_path``.

    Config keys read by this skill:
        db_path: directory where the index is loaded from / saved to.
        dimension: positive integer size of the embedding vectors.
        overwrite_index: when true, every entry already present in the index
            is removed before the new embeddings are stored; when false
            (default), only chunks whose id is not yet indexed are appended.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        global_config: Config,
        vector_store_tracker: Optional[VectorStoreTracker] = None,
    ) -> None:
        """
        Read the skill configuration and create the empty flat L2 index.

        Raises:
            ValueError: if ``dimension`` is missing or not a positive integer.
        """
        super().__init__(config, global_config)
        self._vector_store_tracker = vector_store_tracker
        self._overwrite_index = self._config.get("overwrite_index", False)
        self._VECTOR_DIMENSION = self._config.get("dimension")

        # Fail early with a readable message instead of an opaque error
        # coming out of the faiss bindings.
        dimension = self._VECTOR_DIMENSION
        if isinstance(dimension, bool) or not isinstance(dimension, int) or dimension <= 0:
            raise ValueError(
                f"'dimension' must be a positive integer for the faissdb vector store, got: {dimension!r}"
            )

        self.faiss_index = faiss.IndexFlatL2(self._VECTOR_DIMENSION)

    def run(self, input: Optional[List[Document]] = None) -> List[Document]:
        """
        Store the chunk embeddings of every document in the FAISS index.

        The store is loaded (or created) once, optionally cleared once when
        ``overwrite_index`` is set, filled document by document and saved once
        at the end, so earlier documents of the same run are never wiped.

        Args:
            input: documents whose chunks carry the embeddings to store.

        Returns:
            The input documents, unchanged (an empty list when ``input`` is
            ``None``).
        """
        self.logger.info("Running FaissVectorStoreSkill...")
        db_path = Path(self._config.get("db_path")).expanduser().resolve().as_posix()

        if not input:
            self.logger.info("No documents to store in the vector store.")
            return input if input is not None else []

        self.logger.debug(f"the value of overwrite_index is : {self._overwrite_index}")

        # load or create the vector store
        vector_store = self._get_vector_store(db_path, input)
        existing_ids = list(vector_store.index_to_docstore_id.values())
        # Set for O(1) membership tests; also tracks ids added during this run
        # so a repeated chunk id is never inserted twice.
        known_ids = set(existing_ids)

        if self._overwrite_index and existing_ids:
            self.logger.info("Deleting existing index.")
            vector_store.delete(ids=existing_ids)
            known_ids.clear()

        for doc in input:
            self.logger.info(f"Processing document: {doc.filename}")

            ids_to_add = []
            documents_to_add = []
            embeddings_to_add = []
            metadatas_to_add = []
            for chunk in doc.chunks:
                chunk_id = chunk.chunk_id
                if chunk_id in known_ids:
                    continue
                known_ids.add(chunk_id)
                ids_to_add.append(chunk_id)
                documents_to_add.append(chunk.content)
                embeddings_to_add.append(chunk.embedding)
                metadatas_to_add.append({"source": chunk.source_link, "tags": doc.tag})

            self.logger.debug(f"ids to add are : {ids_to_add}")

            if not ids_to_add:
                self.logger.info("No new embeddings to add (all ids already exist).")
                continue

            self.logger.info(
                f"Adding {len(ids_to_add)} new embeddings to the vector store."
            )
            # Texts, embeddings, metadatas and ids must stay aligned, so only
            # the filtered lists are passed.
            vector_store.add_embeddings(
                text_embeddings=zip(documents_to_add, embeddings_to_add),
                metadatas=metadatas_to_add,
                ids=ids_to_add,
            )

        vector_store.save_local(db_path)

        return input

    def _get_embeddings(self, input: Optional[List[Document]] = None) -> List[float]:
        """
        Collect the embedding of every chunk of every input document.

        Returns:
            One entry per chunk, in document then chunk order; empty when
            ``input`` is ``None`` or empty.
        """
        data = []
        for doc in input or []:
            self.logger.debug(f"Processing document: {doc.filename}")
            data.extend(chunk.embedding for chunk in doc.chunks)
        return data

    def _get_vector_store(
        self, db_path: str, input: Optional[List[Document]] = None
    ) -> FAISS:
        """
        Load the FAISS store saved under ``db_path`` or create an empty one.

        Args:
            db_path: directory that holds (or will hold) ``index.faiss``.
            input: documents whose embeddings are handed to the store as its
                embedding argument.

        Returns:
            The loaded store when ``index.faiss`` exists, otherwise a new
            store built on ``self.faiss_index`` with an in-memory docstore.
        """
        index_path = os.path.join(db_path, "index.faiss")

        # NOTE(review): the store receives a plain list of precomputed vectors
        # where LangChain expects an Embeddings object. This skill only adds
        # precomputed embeddings, but querying the returned store by text
        # would presumably fail - confirm before reusing it for search.
        precomputed_embeddings = self._get_embeddings(input)

        if os.path.exists(index_path):
            self.logger.info(f"FAISS index found at {index_path}.")
            # SECURITY: loading unpickles the stored docstore. Only point
            # db_path at index directories produced by a trusted source.
            return FAISS.load_local(
                db_path,
                embeddings=precomputed_embeddings,
                allow_dangerous_deserialization=True,
            )

        self.logger.info(f"FAISS index not found at {index_path}.")
        return FAISS(
            index=self.faiss_index,
            embedding_function=precomputed_embeddings,
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,19 @@


class RecursiveCharacterTextSplitter(IndexerSkill):
DEFAULT_CHUNK_SIZE = 1000
DEFAULT_CHUNK_OVERLAP = 100

def __init__(self, config: dict, global_config: Config):
super().__init__(config, global_config)
self._set_config_defaults()

def _set_config_defaults(self):
if "chunk_size" not in self._config:
self._config["chunk_size"] = 1000
self._config["chunk_size"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_SIZE

if "chunk_overlap" not in self._config:
self._config["chunk_overlap"] = 100
self._config["chunk_overlap"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_OVERLAP

def run(self, input: Optional[List[Document]] = None) -> List[Document]:
self.logger.info("Running RecursiveCharacterTextSplitter...")
Expand Down
1 change: 1 addition & 0 deletions tests/test_data/test_file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hi, this is a test file version 1.
Loading