Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,10 @@ docs/_build

# Mypy Cache
.mypy_cache/

# logs
logs/indexer_skills.log
src/logs/indexer_skills.log

# chroma index sqlite3
src/localhost/chroma.sqlite3
21 changes: 20 additions & 1 deletion docs/readme/indexer-skills.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,27 @@ Stores embeddings in a Chroma vector store. Ideal for prototyping.
name: chromadb
params:
db_path: path/to/where/your/chroma/db/is # if you don't have any yet, a new one will be created at the specified path
collection_name: replace-this-with-your-collection-name # if you don't have a collection yet, a new one will be created when documents are inseted
collection_name: replace-this-with-your-collection-name # if you don't have a collection yet, a new one will be created when documents are inserted
```

### FAISS
Stores embeddings in a local FAISS vector store (flat L2 index) that is saved to disk at `db_path`.

```yaml
- skill: &FaissDbVectorStore
type: vector-store
name: faissdb
params:
db_path: path/to/where/your/faiss/db/is # if you don't have any yet, a new one will be created at the specified path
dimension: replace-with-your-embeddings-dimension # integer; must match the output dimension of the embedding model you have selected
overwrite_index: true # true: remove all existing documents from the index before storing the new ones. false (default): append to the index, skipping chunk ids that are already stored

```





</details>


Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ dependencies = [
"python-dotenv>=1.0.1",
"python-pptx>=1.0.2",
"unstructured>=0.14.8",
"faiss-cpu>=1.11.0",
"langchain_community>=0.3.18",
]

[project.scripts]
Expand Down
3 changes: 3 additions & 0 deletions src/docs2vecs/subcommands/indexer/config/config_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ definitions:
api_token:
type: string
required: False
dimension:
type: integer
required: False
field_mapping:
type: dict
schema:
Expand Down
20 changes: 20 additions & 0 deletions src/docs2vecs/subcommands/indexer/skills/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,17 @@
from .document_intelligence_skill import AzureDocumentIntelligenceSkill
from .jira_loader_skill import JiraLoaderSkill
from .scrollwordexporter_skill import ScrollWorldExporterSkill
from .chromadb_vector_store_skill import ChromaDBVectorStoreSkill
from .tracker import VectorStoreTracker
from .semantic_splitter_skill import SemanticSplitter
from .recursive_character_splitter_skill import RecursiveCharacterTextSplitter
from .azure_blob_store_uploader_skill import AzureBlobStoreUploaderSkill
from .default_file_reader import DefaultFileReader
from .file_scanner_skill import FileScannerSkill
from .llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill
from .local_document_parser import LocalDocumentParser
from .faiss_vector_store_skill import FaissVectorStoreSkill


__all__ = [
"AzureAda002EmbeddingSkill",
Expand All @@ -11,4 +22,13 @@
"JiraLoaderSkill",
"ScrollWorldExporterSkill",
"VectorStoreTracker",
"ChromaDBVectorStoreSkill",
"SemanticSplitter",
"RecursiveCharacterTextSplitter",
"AzureBlobStoreUploaderSkill",
"DefaultFileReader",
"FileScannerSkill",
"LlamaFastembedEmbeddingSkill",
"LocalDocumentParser",
"FaissVectorStoreSkill",
]
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,23 @@


class ChromaDBVectorStoreSkill(IndexerSkill):
def __init__(self, config: dict, global_config: Config, vector_store_tracker: Optional[VectorStoreTracker] = None) -> None:
def __init__(
self,
config: dict,
global_config: Config,
vector_store_tracker: Optional[VectorStoreTracker] = None,
) -> None:
super().__init__(config, global_config)
self._vector_store_tracker = vector_store_tracker

def run(self, input: Optional[List[Document]] = None) -> List[Document]:
self.logger.info("Running ChromaDBVectorStoreSkill...")

db_path = Path(self._config["db_path"]).expanduser().resolve().as_posix()
chroma_client = self._get_client(db_path)
chroma_collection = chroma_client.get_or_create_collection(self._config["collection_name"])
chroma_client: chromadb.Client = self._get_client(db_path)
chroma_collection = chroma_client.get_or_create_collection(
self._config["collection_name"]
)

self.logger.debug(f"Going to process {len(input)} documents")
for doc in input:
Expand All @@ -29,7 +36,10 @@ def run(self, input: Optional[List[Document]] = None) -> List[Document]:
ids=[chunk.chunk_id for chunk in doc.chunks],
embeddings=[chunk.embedding for chunk in doc.chunks],
documents=[chunk.content for chunk in doc.chunks],
metadatas=[{"source": chunk.source_link, "tags": doc.tag} for chunk in doc.chunks],
metadatas=[
{"source": chunk.source_link, "tags": doc.tag}
for chunk in doc.chunks
],
)

return input
Expand Down
37 changes: 22 additions & 15 deletions src/docs2vecs/subcommands/indexer/skills/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,20 @@

from docs2vecs.subcommands.indexer.config import Config
from docs2vecs.subcommands.indexer.db.mongodb import MongoDbConnection
from docs2vecs.subcommands.indexer.skills.ada002_embedding_skill import AzureAda002EmbeddingSkill
from docs2vecs.subcommands.indexer.skills.azure_blob_store_uploader_skill import AzureBlobStoreUploaderSkill
from docs2vecs.subcommands.indexer.skills.azure_vector_store_skill import AzureVectorStoreSkill
from docs2vecs.subcommands.indexer.skills.chromadb_vector_store_skill import ChromaDBVectorStoreSkill
from docs2vecs.subcommands.indexer.skills.default_file_reader import DefaultFileReader
from docs2vecs.subcommands.indexer.skills.document_intelligence_skill import AzureDocumentIntelligenceSkill
from docs2vecs.subcommands.indexer.skills.file_scanner_skill import FileScannerSkill
from docs2vecs.subcommands.indexer.skills.jira_loader_skill import JiraLoaderSkill
from docs2vecs.subcommands.indexer.skills.llama_fastembed_embedding_skill import LlamaFastembedEmbeddingSkill
from docs2vecs.subcommands.indexer.skills.recursive_character_splitter_skill import RecursiveCharacterTextSplitter
from docs2vecs.subcommands.indexer.skills.scrollwordexporter_skill import ScrollWorldExporterSkill
from docs2vecs.subcommands.indexer.skills.semantic_splitter_skill import SemanticSplitter
from docs2vecs.subcommands.indexer.skills.tracker import VectorStoreTracker
from docs2vecs.subcommands.indexer.skills import AzureAda002EmbeddingSkill
from docs2vecs.subcommands.indexer.skills import AzureBlobStoreUploaderSkill
from docs2vecs.subcommands.indexer.skills import AzureVectorStoreSkill
from docs2vecs.subcommands.indexer.skills import ChromaDBVectorStoreSkill
from docs2vecs.subcommands.indexer.skills import DefaultFileReader
from docs2vecs.subcommands.indexer.skills import AzureDocumentIntelligenceSkill
from docs2vecs.subcommands.indexer.skills import FileScannerSkill
from docs2vecs.subcommands.indexer.skills import JiraLoaderSkill
from docs2vecs.subcommands.indexer.skills import LlamaFastembedEmbeddingSkill
from docs2vecs.subcommands.indexer.skills import RecursiveCharacterTextSplitter
from docs2vecs.subcommands.indexer.skills import ScrollWorldExporterSkill
from docs2vecs.subcommands.indexer.skills import SemanticSplitter
from docs2vecs.subcommands.indexer.skills import VectorStoreTracker
from docs2vecs.subcommands.indexer.skills import FaissVectorStoreSkill


class SkillType(StrEnum):
Expand Down Expand Up @@ -42,6 +43,7 @@ class AvailableSkillName(StrEnum):
# vector stores
AZ_AISearch = "azure-ai-search"
CHROMADB = "chromadb"
FAISSDB = "faissdb"

# uploaders
AZ_BLOB_STORE = "azure-blob-store"
Expand Down Expand Up @@ -74,6 +76,7 @@ class AvailableSkillName(StrEnum):
SkillType.VECTOR_STORE: {
AvailableSkillName.AZ_AISearch: AzureVectorStoreSkill,
AvailableSkillName.CHROMADB: ChromaDBVectorStoreSkill,
AvailableSkillName.FAISSDB: FaissVectorStoreSkill,
},
SkillType.UPLOADER: {AvailableSkillName.AZ_BLOB_STORE: AzureBlobStoreUploaderSkill},
SkillType.SPLITTER: {
Expand All @@ -90,9 +93,13 @@ def get_skill(cls, skill_config_dict: dict, global_config: Config):
try:
skill_type = SkillType(skill_config_dict["type"])
avail_skill_name = AvailableSkillName(skill_config_dict["name"])
return AVAILABLE_SKILLS[skill_type][avail_skill_name](skill_config_dict, global_config)
return AVAILABLE_SKILLS[skill_type][avail_skill_name](
skill_config_dict, global_config
)
except ValueError as error:
raise ValueError(f"Unknown skill of type: {skill_config_dict['type']}, and name: {skill_config_dict['name']}") from error
raise ValueError(
f"Unknown skill of type: {skill_config_dict['type']}, and name: {skill_config_dict['name']}"
) from error


class TrackerFactory:
Expand Down
129 changes: 129 additions & 0 deletions src/docs2vecs/subcommands/indexer/skills/faiss_vector_store_skill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from pathlib import Path
from typing import List
from typing import Dict
from typing import Optional
from typing import Any

import faiss
import os
from docs2vecs.subcommands.indexer.config.config import Config
from docs2vecs.subcommands.indexer.document.document import Document
from docs2vecs.subcommands.indexer.skills.skill import IndexerSkill
from docs2vecs.subcommands.indexer.skills.tracker import VectorStoreTracker
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore


class FaissVectorStoreSkill(IndexerSkill):
    """Store document chunk embeddings in a FAISS index saved on local disk.

    The index is a flat L2 index (``faiss.IndexFlatL2``) wrapped in a LangChain
    ``FAISS`` vector store. Embeddings are taken from the chunks of the input
    documents as they are; this skill does not compute embeddings itself.

    Config keys read by this skill:
        db_path: Directory holding (or that will hold) the saved index.
        dimension: Dimension of the embedding vectors; used to create a new index.
        overwrite_index: When true, every entry already in the index is removed
            before the new ones are stored. When false (the default), only
            chunks whose id is not yet in the index are appended.
    """

    def __init__(
        self,
        config: Dict[str, Any],
        global_config: Config,
        vector_store_tracker: Optional[VectorStoreTracker] = None,
    ) -> None:
        """Read the skill configuration and create an empty flat L2 index.

        Args:
            config: Skill configuration (see the class docstring for the keys).
            global_config: Global indexer configuration, passed to the base class.
            vector_store_tracker: Optional tracker; stored on the instance but
                not used anywhere in this class.

        Raises:
            TypeError: If ``dimension`` is missing from the configuration.
        """
        super().__init__(config, global_config)
        self._vector_store_tracker = vector_store_tracker
        self._overwrite_index = self._config.get("overwrite_index", False)
        self._VECTOR_DIMENSION = self._config.get("dimension")
        if self._VECTOR_DIMENSION is None:
            # Fail early with an actionable message instead of an opaque error
            # from the faiss bindings. TypeError is used on purpose: the skill
            # factory rewraps ValueError as "Unknown skill", which would hide
            # the real cause.
            raise TypeError(
                "FaissVectorStoreSkill requires an integer 'dimension' parameter "
                "matching the embedding model's output dimension."
            )
        self.faiss_index = faiss.IndexFlatL2(self._VECTOR_DIMENSION)

    def run(self, input: Optional[List[Document]] = None) -> List[Document]:
        """Store the chunk embeddings of ``input`` in the FAISS index and save it.

        Args:
            input: Documents whose chunks carry ``chunk_id``, ``embedding``,
                ``content`` and ``source_link``. ``None`` is treated as an
                empty list.

        Returns:
            The input documents, unchanged (an empty list when ``input`` is None).
        """
        self.logger.info("Running FaissVectorStoreSkill...")
        docs = input if input is not None else []
        db_path = Path(self._config.get("db_path")).expanduser().resolve().as_posix()
        # load or create the vector store
        vector_store = self._get_vector_store(db_path, docs)

        # Keep the ids both as a list (handed to delete()) and as a set, so the
        # membership tests below are O(1).
        stored_ids = list(vector_store.index_to_docstore_id.values())
        known_ids = set(stored_ids)

        if self._overwrite_index and stored_ids:
            self.logger.info("Overwriting existing index.")
            vector_store.delete(ids=stored_ids)
            known_ids.clear()

        for doc in docs:
            self.logger.info(f"Processing document: {doc.filename}")
            self.logger.debug(
                f"ids in the processed file are : {[chunk.chunk_id for chunk in doc.chunks]}"
            )
            self.logger.debug(
                f"the value of overwrite_index is : {self._overwrite_index}"
            )

            chunks_to_add = self._select_new_chunks(doc, known_ids)
            if not chunks_to_add:
                self.logger.info("No new embeddings to add (all ids already exist).")
                continue

            self.logger.info(
                f"Adding {len(chunks_to_add)} new embeddings to the vector store."
            )
            # Texts, embeddings, metadatas and ids are all built from the same
            # filtered chunk list, so they stay aligned one-to-one.
            vector_store.add_embeddings(
                text_embeddings=[
                    (chunk.content, chunk.embedding) for chunk in chunks_to_add
                ],
                metadatas=[
                    {"source": chunk.source_link, "tags": doc.tag}
                    for chunk in chunks_to_add
                ],
                ids=[chunk.chunk_id for chunk in chunks_to_add],
            )

        vector_store.save_local(db_path)

        return docs

    def _select_new_chunks(self, doc: Document, known_ids: set) -> list:
        """Return the chunks of ``doc`` whose id is not in ``known_ids``.

        Each selected id is added to ``known_ids``, so an id repeated later in
        the same run (in this or another document) is stored only once.
        """
        fresh_chunks = []
        for chunk in doc.chunks:
            if chunk.chunk_id in known_ids:
                continue
            self.logger.info(
                f"ID {chunk.chunk_id} does not exist in the index, adding it."
            )
            known_ids.add(chunk.chunk_id)
            fresh_chunks.append(chunk)
        return fresh_chunks

    def _get_embeddings(self, input: Optional[List[Document]] = None) -> List[Any]:
        """Collect the embedding of every chunk of every document, in order.

        Returns an empty list when ``input`` is None or empty.
        """
        data = []
        for doc in input or []:
            self.logger.debug(f"Processing document: {doc.filename}")
            data.extend(chunk.embedding for chunk in doc.chunks)
        return data

    def _get_vector_store(
        self, db_path: str, input: Optional[List[Document]] = None
    ) -> FAISS:
        """Load the vector store saved under ``db_path`` or create an empty one.

        Args:
            db_path: Directory expected to contain ``index.faiss``.
            input: Documents of the current run; their embeddings are passed as
                the store's embedding argument.

        Returns:
            The loaded store when ``index.faiss`` exists, otherwise a new store
            backed by ``self.faiss_index`` and an in-memory docstore.
        """
        index_path = os.path.join(db_path, "index.faiss")
        # NOTE(review): a plain list of vectors is passed where LangChain
        # expects an embedding function / Embeddings object. It looks unused
        # here because run() only calls add_embeddings() with precomputed
        # vectors - confirm before using this store for text queries.
        precomputed_embeddings = self._get_embeddings(input)

        if os.path.exists(index_path):
            self.logger.info(f"FAISS index found at {index_path}.")
            # SECURITY: load_local() deserializes a pickle file stored next to
            # the index, hence the explicit opt-in flag. Only point db_path at
            # directories you trust.
            return FAISS.load_local(
                db_path,
                embeddings=precomputed_embeddings,
                allow_dangerous_deserialization=True,
            )

        self.logger.info(
            f"FAISS index not found at {index_path}. Creating a new one."
        )
        return FAISS(
            index=self.faiss_index,
            embedding_function=precomputed_embeddings,
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,19 @@


class RecursiveCharacterTextSplitter(IndexerSkill):
DEFAULT_CHUNK_SIZE = 1000
DEFAULT_CHUNK_OVERLAP = 100

def __init__(self, config: dict, global_config: Config):
super().__init__(config, global_config)
self._set_config_defaults()

def _set_config_defaults(self):
if "chunk_size" not in self._config:
self._config["chunk_size"] = 1000
self._config["chunk_size"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_SIZE

if "chunk_overlap" not in self._config:
self._config["chunk_overlap"] = 100
self._config["chunk_overlap"] = RecursiveCharacterTextSplitter.DEFAULT_CHUNK_OVERLAP

def run(self, input: Optional[List[Document]] = None) -> List[Document]:
self.logger.info("Running RecursiveCharacterTextSplitter...")
Expand Down
1 change: 1 addition & 0 deletions tests/test_data/test_file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hi, this is a test file version 1.
Loading