diff --git a/crewai_tools/__init__.py b/crewai_tools/__init__.py
index 85fe5ed6..e610d1a7 100644
--- a/crewai_tools/__init__.py
+++ b/crewai_tools/__init__.py
@@ -83,10 +83,13 @@
     SnowflakeConfig,
     SnowflakeSearchTool,
     SpiderTool,
+    SpladeSparseEmbedder,
     StagehandTool,
     TavilyExtractorTool,
     TavilySearchTool,
     TXTSearchTool,
+    VectorXSearchArgs,
+    VectorXVectorSearchTool,
     VisionTool,
     WeaviateVectorSearchTool,
     WebsiteSearchTool,
diff --git a/crewai_tools/tools/__init__.py b/crewai_tools/tools/__init__.py
index 2b0bb968..23aa999f 100644
--- a/crewai_tools/tools/__init__.py
+++ b/crewai_tools/tools/__init__.py
@@ -122,6 +122,11 @@
 )
 from .youtube_video_search_tool.youtube_video_search_tool import YoutubeVideoSearchTool
 from .zapier_action_tool.zapier_action_tool import ZapierActionTools
+from .vectorx_vector_search_tool.vectorx_search_tool import (
+    VectorXVectorSearchTool,
+    VectorXSearchArgs,
+    SpladeSparseEmbedder,
+)
 from .parallel_tools import (
     ParallelSearchTool,
 )
diff --git a/crewai_tools/tools/vectorx_vector_search_tool/README.md b/crewai_tools/tools/vectorx_vector_search_tool/README.md
new file mode 100644
index 00000000..16f0b2c8
--- /dev/null
+++ b/crewai_tools/tools/vectorx_vector_search_tool/README.md
@@ -0,0 +1,163 @@
+# VectorX Search Tool for CrewAI
+
+This repository provides a **CrewAI tool integration** for [VectorX](https://vectorxdb.ai), enabling **semantic search** and optional **hybrid (dense + sparse) retrieval** inside CrewAI workflows.
+It uses **Google Gemini embeddings** by default for dense vectors and supports **SPLADE** for sparse vectors.
+
+---
+
+## Features
+
+* 🔹 **Dense search** using Gemini (`models/embedding-001` by default, user-configurable)
+* 🔹 **Hybrid search** (dense + sparse) with support for custom sparse models; defaults to [prithivida/Splade\_PP\_en\_v1](https://huggingface.co/prithivida/Splade_PP_en_v1)
+* 🔹 Seamless integration with **CrewAI Agents**
+* 🔹 Document upsert and query
+* 🔹 Custom encryption and collection support
+
+---
+
+## Installation
+
+You can install the required packages in one of two ways:
+
+### Option 1: Install manually via pip
+
+```bash
+pip install crewai vecx google-genai
+```
+
+> ⚠️ If you want to enable **sparse embeddings (SPLADE)**, also install:
+
+```bash
+pip install transformers torch
+```
+
+---
+
+### Option 2: Install everything from `requirements.txt`
+
+```bash
+pip install -r requirements.txt
+```
+
+---
+
+### `requirements.txt` contents:
+
+```txt
+crewai==0.175.0
+vecx==0.33.1b5
+google-genai==1.32.0
+torch==2.8.0
+transformers==4.45.2
+tokenizers==0.20.3
+numpy==2.2.4
+```
+
+---
+
+## Usage
+
+### 1. Import & Configure
+
+```python
+import os
+from crewai import Agent, Crew, LLM, Task, Process
+from crewai_tools import VectorXVectorSearchTool
+
+# Initialize the tool
+tool = VectorXVectorSearchTool(
+    api_token=os.getenv("VECTORX_TOKEN"),
+    collection_name="my_vectorx_collection",
+    encryption_key=os.getenv("ENCRYPTION_KEY"),
+    use_sparse=False,  # set True to enable hybrid SPLADE search
+    top_k=3,
+)
+```
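+
+You can also plug in your own dense embedder instead of Gemini: the constructor accepts an `embed_fn` callable that maps a string to a list of floats. The sketch below is illustrative only and assumes the optional `sentence-transformers` package, which is not a dependency of this tool:
+
+```python
+from sentence_transformers import SentenceTransformer  # assumed extra dependency
+
+st_model = SentenceTransformer("all-MiniLM-L6-v2")
+
+def my_embed_fn(text: str) -> list[float]:
+    # VectorXVectorSearchTool expects a plain list of floats.
+    return st_model.encode(text).tolist()
+
+tool = VectorXVectorSearchTool(
+    api_token=os.getenv("VECTORX_TOKEN"),
+    collection_name="my_vectorx_collection",
+    embed_fn=my_embed_fn,  # overrides the default Gemini embedder
+)
+```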
+
+---
+
+### 2. Store Documents (Example Usage)
+
+```python
+tool.store_documents(
+    [
+        "Python is a versatile programming language.",
+        "JavaScript is widely used in web development.",
+        "Rust is known for safety and performance.",
+    ],
+    [
+        {"category": "language", "name": "Python"},
+        {"category": "language", "name": "JavaScript"},
+        {"category": "language", "name": "Rust"},
+    ],
+)
+```
+
+---
+
+### 3. Set Up a CrewAI Agent
+
+```python
+llm = LLM(
+    model="gemini/gemini-1.5-flash",
+    api_key=os.getenv("GEMINI_API_KEY"),
+)
+
+agent = Agent(
+    role="Vector Search Agent",
+    goal="Answer user questions using VectorX search",
+    backstory="You're a helpful assistant that uses VectorX for semantic retrieval.",
+    llm=llm,
+    tools=[tool],
+)
+
+task = Task(
+    description="Answer the user's question using VectorX search. The user asked: {query}",
+    agent=agent,
+    expected_output="A concise, relevant answer based on documents.",
+)
+
+crew = Crew(agents=[agent], tasks=[task], process=Process.sequential)
+```
+
+---
+
+### 4. Run a Query (Example Usage)
+
+```python
+if __name__ == "__main__":
+    question = "Tell me about Python language features."
+    print(f"\nQuery: {question}")
+    result = crew.kickoff({"query": question})
+    print("\nAnswer:\n", result)
+```
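+
+For quick smoke tests you can also call the tool directly, bypassing the agent, the same way the unit tests do (`_run` is the method this tool implements for CrewAI; the result fields shown are the ones it produces):
+
+```python
+results = tool._run("Tell me about Python language features.", top_k=2)
+for r in results:
+    print(r["score"], r["text"])
+```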
+
+---
+
+## Hybrid Search with SPLADE
+
+Enable hybrid mode:
+
+```python
+tool = VectorXVectorSearchTool(
+    api_token=os.getenv("VECTORX_TOKEN"),
+    collection_name="my_vectorx_collection",
+    use_sparse=True,  # 🔹 enable SPLADE hybrid retrieval
+)
+```
+
+This will combine **dense Gemini embeddings** with **sparse lexical signals** from SPLADE, improving recall on keyword-heavy queries.
+
+---
+
+## Environment Variables (.env)
+
+| Variable         | Description                                                                              |
+| ---------------- | ---------------------------------------------------------------------------------------- |
+| `VECTORX_TOKEN`  | API token for your VectorX instance                                                      |
+| `GEMINI_API_KEY` | Google Gemini API key for embeddings & LLM                                               |
+| `ENCRYPTION_KEY` | (Optional) Encryption key for secure storage                                             |
+| `GEMINI_MODEL`   | (Optional) Gemini embedding model ID. Defaults to `models/embedding-001`                 |
+| `SPLADE_MODEL`   | (Optional) SPLADE model name from HuggingFace. Defaults to `prithivida/Splade_PP_en_v1`  |
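+
+A minimal `.env` might look like this (all values are placeholders):
+
+```txt
+VECTORX_TOKEN=your-vectorx-token
+GEMINI_API_KEY=your-gemini-api-key
+# Optional:
+# ENCRYPTION_KEY=your-encryption-key
+# GEMINI_MODEL=models/embedding-001
+# SPLADE_MODEL=prithivida/Splade_PP_en_v1
+```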
+ ) + args_schema: Type[BaseModel] = VectorXSearchArgs + + def __init__( + self, + api_token: str, + collection_name: str, + embed_fn: Optional[Callable[[str], List[float]]] = None, + encryption_key: Optional[str] = None, + space_type: str = "cosine", + use_sparse: bool = False, + sparse_embedder: Optional[Any] = None, + sparse_vocab_size: Optional[int] = None, + top_k: int = 3, + gemini_model: Optional[str] = None, + embedding_dim: Optional[int] = None, + ): + """Initializes the VectorX search tool, sets up index and embedding model.""" + super().__init__() + object.__setattr__(self, "api_token", api_token) + object.__setattr__(self, "collection_name", collection_name) + object.__setattr__(self, "encryption_key", encryption_key) + object.__setattr__(self, "space_type", space_type) + object.__setattr__(self, "use_sparse", use_sparse) + object.__setattr__(self, "top_k", top_k) + object.__setattr__(self, "embedding_dim", embedding_dim) + + gemini_model = gemini_model or os.environ.get("GEMINI_MODEL", "models/embedding-001") + _logger.info(f"Using Gemini embedding model: {gemini_model}") + + # Setup sparse encoder + if use_sparse: + if sparse_embedder is None: + sparse_embedder = SpladeSparseEmbedder() + object.__setattr__(self, "sparse_embedder", sparse_embedder) + sparse_vocab_size = sparse_vocab_size or sparse_embedder.get_vocab_size() + else: + object.__setattr__(self, "sparse_embedder", None) + sparse_vocab_size = 0 + object.__setattr__(self, "sparse_vocab_size", sparse_vocab_size) + + # Dense embedding function setup (default: Gemini) + if embed_fn: + object.__setattr__(self, "embed_fn", embed_fn) + else: + if genai is None: + raise ImportError("google-genai not installed. Install with `pip install google-genai`") + + gemini_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY")) + + def gemini_embed(text: str) -> List[float]: + """Uses Gemini to generate dense embeddings.""" + emb = gemini_client.models.embed_content( + model=gemini_model, + contents=text + ) + vector_obj = emb.embeddings[0].values + vec = [float(v) for v in (vector_obj.values() if isinstance(vector_obj, dict) else vector_obj)] + _logger.debug(f"Gemini embedding len={len(vec)}, sample={vec[:5]}") + return vec + + object.__setattr__(self, "embed_fn", gemini_embed) + + # Setup VectorX index + client = VectorX(token=api_token) + object.__setattr__(self, "client", client) + + # Determine embedding dimension + # dim = len(self.embed_fn("test")) + + # --- CHANGED: Deferring embedding dimension API call + def _get_embedding_dim(): + if self.embedding_dim is not None: + return self.embedding_dim + try: + test_vec = self.embed_fn("test") + object.__setattr__(self, "embedding_dim", len(test_vec)) + except Exception: + _logger.warning("Failed to determine embedding dimension. 
+
+
+# ---------------- CrewAI Tool: VectorX Search ---------------- #
+class VectorXSearchArgs(BaseModel):
+    """Argument schema for the VectorX search tool."""
+    query: str
+    top_k: Optional[int] = None
+
+
+class VectorXVectorSearchTool(BaseTool):
+    """CrewAI tool for semantic search using the VectorX vector database.
+
+    Supports both dense (semantic) and sparse (keyword-like, via SPLADE) search.
+    The default embedding model is Gemini via `google-genai`.
+
+    Attributes:
+        api_token: API token for VectorX.
+        collection_name: Name of the index/collection in VectorX.
+        embed_fn: Custom embedding function (optional).
+        encryption_key: Encryption key for secure collections.
+        space_type: Vector distance metric (e.g., "cosine").
+        use_sparse: Whether to use sparse (SPLADE) embedding.
+        sparse_embedder: SPLADE embedder instance.
+        sparse_vocab_size: Vocabulary size for the sparse encoder.
+        top_k: Number of results to retrieve.
+    """
+
+    name: str = "VectorXVectorSearchTool"
+    description: str = (
+        "Tool for semantic search using VectorX vector DB "
+        "with optional sparse embedding support (SPLADE)."
+    )
+    args_schema: Type[BaseModel] = VectorXSearchArgs
+
+    def __init__(
+        self,
+        api_token: str,
+        collection_name: str,
+        embed_fn: Optional[Callable[[str], List[float]]] = None,
+        encryption_key: Optional[str] = None,
+        space_type: str = "cosine",
+        use_sparse: bool = False,
+        sparse_embedder: Optional[Any] = None,
+        sparse_vocab_size: Optional[int] = None,
+        top_k: int = 3,
+        gemini_model: Optional[str] = None,
+        embedding_dim: Optional[int] = None,
+    ):
+        """Initializes the VectorX search tool and sets up the index and embedding model."""
+        super().__init__()
+        # BaseTool is a pydantic model, so bypass validation for runtime-only state.
+        object.__setattr__(self, "api_token", api_token)
+        object.__setattr__(self, "collection_name", collection_name)
+        object.__setattr__(self, "encryption_key", encryption_key)
+        object.__setattr__(self, "space_type", space_type)
+        object.__setattr__(self, "use_sparse", use_sparse)
+        object.__setattr__(self, "top_k", top_k)
+        object.__setattr__(self, "embedding_dim", embedding_dim)
+
+        gemini_model = gemini_model or os.environ.get("GEMINI_MODEL", "models/embedding-001")
+        _logger.info(f"Using Gemini embedding model: {gemini_model}")
+
+        # Set up the sparse encoder
+        if use_sparse:
+            if sparse_embedder is None:
+                sparse_embedder = SpladeSparseEmbedder()
+            object.__setattr__(self, "sparse_embedder", sparse_embedder)
+            sparse_vocab_size = sparse_vocab_size or sparse_embedder.get_vocab_size()
+        else:
+            object.__setattr__(self, "sparse_embedder", None)
+            sparse_vocab_size = 0
+        object.__setattr__(self, "sparse_vocab_size", sparse_vocab_size)
+
+        # Dense embedding function setup (default: Gemini)
+        if embed_fn:
+            object.__setattr__(self, "embed_fn", embed_fn)
+        else:
+            if genai is None:
+                raise ImportError("google-genai not installed. Install with `pip install google-genai`")
+
+            gemini_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
+
+            def gemini_embed(text: str) -> List[float]:
+                """Uses Gemini to generate dense embeddings."""
+                emb = gemini_client.models.embed_content(
+                    model=gemini_model,
+                    contents=text,
+                )
+                # google-genai returns ContentEmbedding objects whose .values is a list of floats.
+                vec = [float(v) for v in emb.embeddings[0].values]
+                _logger.debug(f"Gemini embedding len={len(vec)}, sample={vec[:5]}")
+                return vec
+
+            object.__setattr__(self, "embed_fn", gemini_embed)
+
+        # Set up the VectorX client
+        client = VectorX(token=api_token)
+        object.__setattr__(self, "client", client)
+
+        # Determine the embedding dimension lazily: only hit the embedding API
+        # when a new index must be created and no explicit embedding_dim was given.
+        def _get_embedding_dim():
+            if self.embedding_dim is not None:
+                return self.embedding_dim
+            try:
+                test_vec = self.embed_fn("test")
+                object.__setattr__(self, "embedding_dim", len(test_vec))
+            except Exception:
+                _logger.warning("Failed to determine embedding dimension. Defaulting to 768")
+                object.__setattr__(self, "embedding_dim", 768)
+            return self.embedding_dim
+        object.__setattr__(self, "_get_embedding_dim", _get_embedding_dim)
+
+        try:
+            if use_sparse:
+                index = client.get_hybrid_index(name=collection_name, key=encryption_key)
+            else:
+                index = client.get_index(name=collection_name, key=encryption_key)
+        except Exception:
+            _logger.info(f"Creating new index {collection_name}")
+            if use_sparse:
+                client.create_hybrid_index(
+                    name=collection_name,
+                    dimension=self._get_embedding_dim(),
+                    space_type=space_type,
+                    vocab_size=sparse_vocab_size,
+                    key=encryption_key,
+                )
+                index = client.get_hybrid_index(name=collection_name, key=encryption_key)
+            else:
+                client.create_index(
+                    name=collection_name,
+                    dimension=self._get_embedding_dim(),
+                    space_type=space_type,
+                    key=encryption_key,
+                )
+                index = client.get_index(name=collection_name, key=encryption_key)
+
+        object.__setattr__(self, "index", index)
+
+    def _prepare_sparse_vector(self, text: str) -> Dict[str, Any]:
+        """Generates a sparse representation for the given text using SPLADE."""
+        sparse_vec = self.sparse_embedder.encode_query(text, return_sparse=True)[0]
+        return sparse_vec
+
+    def _run(self, query: str, top_k: Optional[int] = None, **kwargs) -> Any:
+        """Performs a semantic or hybrid search on VectorX.
+
+        Args:
+            query: The search query.
+            top_k: Number of top results to return.
+
+        Returns:
+            A list of result dicts, or a single-element list describing an error.
+        """
+        top_k = top_k or self.top_k
+        embedding = self.embed_fn(query)
+        results = []
+
+        try:
+            if self.use_sparse:
+                sparse_vec = self._prepare_sparse_vector(query)
+                search_results = self.index.search(
+                    dense_vector=embedding,
+                    sparse_vector=sparse_vec,
+                    dense_top_k=top_k,
+                    sparse_top_k=top_k,
+                    filter_query={},
+                )
+                for r in search_results:
+                    results.append({
+                        "text": r.get("meta", {}).get("value", ""),
+                        "score": r.get("rrf_score", 0),
+                        "metadata": r.get("meta", {}),
+                    })
+            else:
+                search_results = self.index.query(
+                    vector=embedding,
+                    top_k=top_k,
+                    include_vectors=False,
+                )
+                for r in search_results:
+                    results.append({
+                        "text": r.get("meta", {}).get("value", ""),
+                        "score": r.get("similarity", 0),
+                        "metadata": r.get("meta", {}),
+                    })
+        except Exception as e:
+            _logger.error(f"VectorX Search Error: {e}")
+            return [{"error": "Search failed"}]
+
+        return results or [{"message": "No results found"}]
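+
+    # Illustrative only: a dense result returned by _run has this shape
+    # ("value" holds the stored document text, the score here is made up):
+    #     [{"text": "doc text", "score": 0.92,
+    #       "metadata": {"category": "language", "value": "doc text"}}]
+    # In hybrid mode, "score" carries the RRF fusion score instead.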
+ """ + metadatas = metadatas or [{} for _ in texts] + events = [] + + for text, meta in zip_longest(texts, metadatas, fillvalue={}): + meta_copy = deepcopy(meta) + meta_copy["value"] = text + embedding = self.embed_fn(text) + + event = { + "id": str(uuid.uuid4()), + "meta": meta_copy, + } + + if self.use_sparse: + sparse_vec = self._prepare_sparse_vector(text) + event["dense_vector"] = embedding + event["sparse_vector"] = sparse_vec + else: + event["vector"] = embedding + + events.append(event) + + try: + self.index.upsert(events) + except Exception as e: + _logger.error(f"VectorX Upsert Error: {e}") diff --git a/pyproject.toml b/pyproject.toml index 82c73f8e..1cedb5bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -144,6 +144,9 @@ contextual = [ "contextual-client>=0.1.0", "nest-asyncio>=1.6.0", ] +vectorx = [ + "vecx>=0.33.1b5" +] [tool.pytest.ini_options] pythonpath = ["."] diff --git a/tests/tools/test_vectorx_search_tool.py b/tests/tools/test_vectorx_search_tool.py new file mode 100644 index 00000000..0c0f1ba5 --- /dev/null +++ b/tests/tools/test_vectorx_search_tool.py @@ -0,0 +1,105 @@ +import pytest + +# Import VectorX tool class +from crewai_tools import VectorXVectorSearchTool + +# ==== Utility Functions / Dummy Classes ==== + +def dummy_embed(text: str): + """Returns a fixed-size dense embedding for testing.""" + return [0.1] * 128 + +class DummyIndex: + """Simulates index behavior for upsert, query, and hybrid search.""" + def __init__(self): + self.docs = [] + + def upsert(self, events): + self.docs.extend(events) + + def query(self, vector, top_k, include_vectors): + return self.docs[:top_k] + + def search(self, dense_vector, sparse_vector, dense_top_k, sparse_top_k, filter_query): + return self.docs[:dense_top_k] + +class DummyClient: + """Simulates VectorX client with index lifecycle methods.""" + def get_index(self, name, key=None): + return DummyIndex() + + def create_index(self, name, dimension, space_type, key=None): + return True + + def get_hybrid_index(self, name, key=None): + return DummyIndex() + + def create_hybrid_index(self, name, dimension, space_type, vocab_size, key=None): + return True + +class DummySPLADE: + """Simulates SPLADE sparse embedder for hybrid search.""" + def get_vocab_size(self): + return 10 + + def encode_query(self, text, return_sparse=True): + return [{"indices": [0], "values": [1.0]}] + +# ==== Fixtures ==== + +@pytest.fixture +def vx_tool(monkeypatch): + """ + Fixture that provides a VectorXVectorSearchTool with + its VectorX client monkeypatched to the DummyClient. 
+ """ + monkeypatch.setattr( + "crewai_tools.tools.vectorx_vector_search_tool.vectorx_search_tool.VectorX", + lambda token: DummyClient() + ) + return VectorXVectorSearchTool( + api_token="fake-token", + collection_name="test_collection", + embed_fn=dummy_embed, + use_sparse=False + ) + +# ==== Tests ==== + +def test_store_and_search_dense(vx_tool): + """ + Tests dense-only mode: + - Documents are stored via store_documents() + - Search returns results with `text` and `score` fields + """ + vx_tool.store_documents(["doc1", "doc2"], [{"id": "1"}, {"id": "2"}]) + results = vx_tool._run("query") + assert isinstance(results, list) + assert "text" in results[0] + assert results[0]["text"] == "doc1" + assert "score" in results[0] + +def test_hybrid_search(monkeypatch): + """ + Tests hybrid mode (dense + sparse): + - SPLADE embedder is replaced with DummySPLADE + - Documents are stored and search returns expected results + """ + monkeypatch.setattr( + "crewai_tools.tools.vectorx_vector_search_tool.vectorx_search_tool.VectorX", + lambda token: DummyClient() + ) + monkeypatch.setattr( + "crewai_tools.tools.vectorx_vector_search_tool.vectorx_search_tool.SpladeSparseEmbedder", + lambda *args, **kwargs: DummySPLADE() + ) + tool = VectorXVectorSearchTool( + api_token="tok", + collection_name="hybrid_col", + embed_fn=dummy_embed, + use_sparse=True + ) + tool.store_documents(["doc_hybrid"], [{"id": "h1"}]) + results = tool._run("query") + assert isinstance(results, list) + assert results[0]["text"] == "doc_hybrid" \ No newline at end of file