Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
refactor: refactor LlamaIndex extensions
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Sep 4, 2024
1 parent 6acb898 commit 106cff0
Show file tree
Hide file tree
Showing 18 changed files with 801 additions and 601 deletions.
35 changes: 19 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,25 +59,26 @@ import os
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from quackling.llama_index.node_parsers.hier_node_parser import HierarchicalNodeParser
from quackling.llama_index.readers.docling_reader import DoclingReader
from quackling.llama_index.node_parsers import HierarchicalJSONNodeParser
from quackling.llama_index.readers import DoclingPDFReader

DOCS = ["https://arxiv.org/pdf/2311.18481"]
QUERY = "What is DocQA?"
DOCS = ["https://arxiv.org/pdf/2206.01062"]
QUESTION = "How many pages were human annotated?"
EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
LLM = HuggingFaceInferenceAPI(
token=os.getenv("HF_TOKEN"),
model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
)

index = VectorStoreIndex.from_documents(
documents=DoclingReader(parse_type=DoclingReader.ParseType.JSON).load_data(DOCS),
documents=DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON).load_data(DOCS),
embed_model=EMBED_MODEL,
transformations=[HierarchicalNodeParser()],
transformations=[HierarchicalJSONNodeParser()],
)
query_engine = index.as_query_engine(llm=LLM)
response = query_engine.query(QUERY)
# > DocQA is a question-answering conversational assistant [...]
result = query_engine.query(QUESTION)
print(result.response)
# > 80K pages were human annotated
```

### Chunking
Expand All @@ -88,7 +89,7 @@ to Docling document's nodes:

```python
from docling.document_converter import DocumentConverter
from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
from quackling.core.chunkers import HierarchicalChunker

doc = DocumentConverter().convert_single("https://arxiv.org/pdf/2408.09869").output
chunks = list(HierarchicalChunker().chunk(doc))
Expand Down Expand Up @@ -120,13 +121,15 @@ Please read [Contributing to Quackling](./CONTRIBUTING.md) for details.
If you use Quackling in your projects, please consider citing the following:

```bib
@software{Docling,
author = {Deep Search Team},
month = {7},
title = {{Docling}},
url = {https://github.com/DS4SD/docling},
version = {main},
year = {2024}
@techreport{Docling,
author = "Deep Search Team",
month = 8,
title = "Docling Technical Report",
url = "https://arxiv.org/abs/2408.09869",
eprint = "2408.09869",
doi = "10.48550/arXiv.2408.09869",
version = "1.0.0",
year = 2024
}
```

Expand Down
102 changes: 60 additions & 42 deletions examples/basic_pipeline.ipynb

Large diffs are not rendered by default.

214 changes: 89 additions & 125 deletions examples/hybrid_pipeline.ipynb

Large diffs are not rendered by default.

183 changes: 81 additions & 102 deletions examples/native_nodes.ipynb

Large diffs are not rendered by default.

237 changes: 112 additions & 125 deletions examples/node_transformations.ipynb

Large diffs are not rendered by default.

487 changes: 336 additions & 151 deletions examples/prev_next_augmentation.ipynb

Large diffs are not rendered by default.

33 changes: 31 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ llama-index-llms-huggingface-api = { version = "^0.2.0", optional = true }
llama-index-vector-stores-milvus = { version = "^0.2.1", optional = true }
llama-index-postprocessor-flag-embedding-reranker = {version = "^0.2.0", optional = true }
flagembedding = { version = "^1.2.10", optional = true }
peft = { version = "^0.12.0", optional = true } # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297
jsonpath-ng = { version = "^1.6.1", optional = true }

##############
Expand All @@ -69,6 +70,7 @@ examples = [
"llama-index-vector-stores-milvus",
"llama-index-postprocessor-flag-embedding-reranker",
"flagembedding",
"peft", # TODO: remove once we can update FlagEmbedding past 1.2.11 to include https://github.com/FlagOpen/FlagEmbedding/commit/1613625417e293bf98311cb8ae0819a0a3af5297
"jsonpath-ng",
]

Expand Down
2 changes: 2 additions & 0 deletions quackling/core/chunkers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker # noqa
4 changes: 4 additions & 0 deletions quackling/llama_index/node_parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from quackling.llama_index.node_parsers.hier_node_parser import ( # noqa
HierarchicalJSONNodeParser,
)
10 changes: 9 additions & 1 deletion quackling/llama_index/node_parsers/hier_node_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,15 @@
)
from llama_index.core.utils import get_tqdm_iterable
from pydantic import Field
from typing_extensions import deprecated

from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
from quackling.core.chunkers import HierarchicalChunker
from quackling.llama_index.node_parsers.base import NodeMetadata


@deprecated(
"Use `quackling.llama_index.node_parsers.HierarchicalJSONNodeParser` instead."
)
class HierarchicalNodeParser(NodeParser):

# override default to False to avoid inheriting source doc's metadata
Expand Down Expand Up @@ -79,3 +83,7 @@ def _parse_nodes(
).model_dump()
all_nodes.append(node)
return all_nodes


class HierarchicalJSONNodeParser(HierarchicalNodeParser):
    """Drop-in successor of the deprecated ``HierarchicalNodeParser``; inherits all behavior unchanged."""

    pass
2 changes: 2 additions & 0 deletions quackling/llama_index/readers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from quackling.llama_index.readers.docling_pdf_reader import DoclingPDFReader # noqa
32 changes: 32 additions & 0 deletions quackling/llama_index/readers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
# SPDX-License-Identifier: MIT
#

from enum import Enum

from docling_core.types import Document as DLDocument
from llama_index.core.readers.base import BasePydanticReader
from llama_index.core.schema import Document as LIDocument
from pydantic import BaseModel


Expand All @@ -16,3 +21,30 @@ class ExcludedKeys:

dl_doc_hash: str
# source: str


class BaseDoclingReader(BasePydanticReader):
    """Common base for Docling-backed LlamaIndex readers.

    Turns a Docling document into a LlamaIndex document, exporting the text
    either as Markdown or as the document's raw JSON serialization.
    """

    class ParseType(str, Enum):
        # Export format applied when converting a Docling doc to text.
        MARKDOWN = "markdown"
        JSON = "json"

    # Markdown export is the default.
    parse_type: ParseType = ParseType.MARKDOWN

    def _create_li_doc_from_dl_doc(self, dl_doc: DLDocument) -> LIDocument:
        """Map a Docling document onto an equivalent LlamaIndex document.

        Raises:
            RuntimeError: if ``parse_type`` is not a recognized value.
        """
        if self.parse_type == self.ParseType.JSON:
            text = dl_doc.model_dump_json()
        elif self.parse_type == self.ParseType.MARKDOWN:
            text = dl_doc.export_to_markdown()
        else:
            raise RuntimeError(f"Unexpected parse type encountered: {self.parse_type}")

        # The Docling document hash doubles as the LlamaIndex doc id and is
        # also recorded in the metadata for downstream lookup.
        doc_hash = dl_doc.file_info.document_hash
        li_doc = LIDocument(
            doc_id=doc_hash,
            text=text,
            excluded_embed_metadata_keys=DocumentMetadata.ExcludedKeys.EMBED,
            excluded_llm_metadata_keys=DocumentMetadata.ExcludedKeys.LLM,
        )
        li_doc.metadata = DocumentMetadata(dl_doc_hash=doc_hash).model_dump()
        return li_doc
10 changes: 10 additions & 0 deletions quackling/llama_index/readers/docling_pdf_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from quackling.llama_index.readers.docling_reader import DoclingReader


class DoclingPDFReader(DoclingReader):
    """Drop-in successor of the deprecated ``DoclingReader``; inherits all behavior unchanged."""

    pass
34 changes: 4 additions & 30 deletions quackling/llama_index/readers/docling_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,43 +3,17 @@
# SPDX-License-Identifier: MIT
#

from enum import Enum
from typing import Iterable

from docling.document_converter import DocumentConverter
from docling_core.types import Document as DLDocument
from llama_index.core.readers.base import BasePydanticReader
from llama_index.core.schema import Document as LIDocument
from typing_extensions import deprecated

from quackling.llama_index.readers.base import DocumentMetadata
from quackling.llama_index.readers.base import BaseDoclingReader


class DoclingReader(BasePydanticReader):
class ParseType(str, Enum):
MARKDOWN = "markdown"
JSON = "json"

parse_type: ParseType = ParseType.MARKDOWN

def _create_li_doc_from_dl_doc(self, dl_doc: DLDocument) -> LIDocument:
if self.parse_type == self.ParseType.MARKDOWN:
text = dl_doc.export_to_markdown()
elif self.parse_type == self.ParseType.JSON:
text = dl_doc.model_dump_json()
else:
raise RuntimeError(f"Unexpected parse type encountered: {self.parse_type}")

li_doc = LIDocument(
doc_id=dl_doc.file_info.document_hash,
text=text,
excluded_embed_metadata_keys=DocumentMetadata.ExcludedKeys.EMBED,
excluded_llm_metadata_keys=DocumentMetadata.ExcludedKeys.LLM,
)
li_doc.metadata = DocumentMetadata(
dl_doc_hash=dl_doc.file_info.document_hash,
).model_dump()
return li_doc

@deprecated("Use `quackling.llama_index.readers.DoclingPDFReader` instead.")
class DoclingReader(BaseDoclingReader):
def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:

file_paths = file_path if isinstance(file_path, list) else [file_path]
Expand Down
9 changes: 5 additions & 4 deletions tests/integration/test_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore

from quackling.llama_index.node_parsers.hier_node_parser import HierarchicalNodeParser
from quackling.llama_index.readers.docling_reader import DoclingReader
from quackling.llama_index.node_parsers import HierarchicalJSONNodeParser
from quackling.llama_index.readers import DoclingPDFReader


def test_retrieval():
Expand All @@ -19,8 +19,9 @@ def test_retrieval():
MILVUS_DB_FNAME = "milvus_demo.db"
MILVUS_COLL_NAME = "quackling_test_coll"

reader = DoclingReader(parse_type=DoclingReader.ParseType.JSON)
node_parser = HierarchicalNodeParser(id_gen_seed=ID_GEN_SEED)
reader = DoclingPDFReader(parse_type=DoclingPDFReader.ParseType.JSON)
node_parser = HierarchicalJSONNodeParser(id_gen_seed=ID_GEN_SEED)

embed_model = HuggingFaceEmbedding(model_name=HF_EMBED_MODEL_ID)

with TemporaryDirectory() as tmp_dir:
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from docling_core.types import Document as DLDocument

from quackling.core.chunkers.hierarchical_chunker import HierarchicalChunker
from quackling.core.chunkers import HierarchicalChunker


def test_chunk_without_metadata():
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/test_li_hierarchical_node_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@

from llama_index.core.schema import Document as LIDocument

from quackling.llama_index.node_parsers.hier_node_parser import HierarchicalNodeParser
from quackling.llama_index.node_parsers import HierarchicalJSONNodeParser


def test_node_parse():
with open("tests/unit/data/1_inp_li_doc.json") as f:
data_json = f.read()
li_doc = LIDocument.from_json(data_json)
node_parser = HierarchicalNodeParser(id_gen_seed=42)
node_parser = HierarchicalJSONNodeParser(id_gen_seed=42)
nodes = node_parser._parse_nodes(nodes=[li_doc])
act_data = dict(root=[n.dict() for n in nodes])
with open("tests/unit/data/1_out_nodes.json") as f:
Expand Down

0 comments on commit 106cff0

Please sign in to comment.