This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

test: add retrieval test
Signed-off-by: Panos Vagenas <[email protected]>
vagenas committed Aug 27, 2024
1 parent 53f32b9 commit 6378053
Showing 5 changed files with 224 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .github/actions/setup-poetry/action.yml
@@ -15,5 +15,5 @@ runs:
         python-version: ${{ inputs.python-version }}
         cache: 'poetry'
     - name: Install dependencies
-      run: poetry install
+      run: poetry install --all-extras
       shell: bash
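
A hedged aside on the flag change above (not part of the commit itself): plain "poetry install" skips optional extras, while the new retrieval test imports integrations that ship as extras, so CI needs them all. A minimal Python check of what --all-extras makes available, using only module names that appear in tests/test_retrieval.py below:

import importlib

# Each module ships as an optional extra; without --all-extras the
# corresponding import in tests/test_retrieval.py would raise ImportError.
for mod in (
    "llama_index.embeddings.huggingface",  # provides HuggingFaceEmbedding
    "llama_index.vector_stores.milvus",  # provides MilvusVectorStore
):
    importlib.import_module(mod)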
8 changes: 6 additions & 2 deletions quackling/llama_index/node_parsers/hier_node_parser.py
@@ -30,12 +30,12 @@ class HierarchicalNodeParser(NodeParser):
     include_metadata: bool = Field(
         default=False, description="Whether or not to consider metadata when splitting."
     )
+    id_gen_seed: int | None = None

     def _parse_nodes(
         self,
         nodes: Sequence[BaseNode],
         show_progress: bool = False,
-        id_gen_seed: int | None = None,
         **kwargs: Any,
     ) -> list[BaseNode]:
         # based on llama_index.core.node_parser.interface.TextSplitter
@@ -47,7 +47,11 @@ def _parse_nodes(
         excl_meta_embed = NodeMetadata.ExcludedKeys.EMBED
         excl_meta_llm = NodeMetadata.ExcludedKeys.LLM

-        seed = id_gen_seed if id_gen_seed is not None else datetime.now().timestamp()
+        seed = (
+            self.id_gen_seed
+            if self.id_gen_seed is not None
+            else datetime.now().timestamp()
+        )
         rd = Random()
         rd.seed(seed)
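
A hedged sketch of what the seeding above enables: one seeded Random instance makes node IDs reproducible across runs, which lets the tests below compare output against frozen JSON fixtures. The exact ID derivation is outside this hunk, so building UUIDs from getrandbits is an assumption about one common approach, not the library's actual code:

from random import Random
import uuid

def gen_ids(seed: int, n: int) -> list[str]:
    # Mirrors the pattern in the hunk: one Random instance, seeded once.
    rd = Random()
    rd.seed(seed)
    return [str(uuid.UUID(int=rd.getrandbits(128), version=4)) for _ in range(n)]

# Same seed, same ID sequence: the property the fixtures rely on.
assert gen_ids(42, 3) == gen_ids(42, 3)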
166 changes: 166 additions & 0 deletions tests/data/2_out_retrieval_results.json
@@ -0,0 +1,166 @@
{
"root": [
{
"node": {
"id_": "b8db0672-f42d-47cc-80d4-af5974273ca3",
"embedding": null,
"metadata": {
"path": "$.main-text[74]"
},
"excluded_embed_metadata_keys": [
"path"
],
"excluded_llm_metadata_keys": [
"path"
],
"relationships": {
"1": {
"node_id": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc",
"node_type": "4",
"metadata": {
"dl_doc_hash": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc"
},
"hash": "1b7dc86d6f19efda934a54e72a0743d166f8d085a9089715c8b6336289ea9b36",
"class_name": "RelatedNodeInfo"
},
"2": {
"node_id": "287d06ca-6f4c-469a-8b22-d3081c8eaee9",
"node_type": "1",
"metadata": {
"path": "$.main-text[73]"
},
"hash": "5804beb788b99ed6df0a7f2d99073687280737a08fdad6ecdc51df3bac8894b9",
"class_name": "RelatedNodeInfo"
},
"3": {
"node_id": "f8cda88b-436d-46e2-b83c-fe0be037e5ed",
"node_type": "1",
"metadata": {
"path": "$.main-text[75]"
},
"hash": "6abf99c75ad9eb9d0f6bb81fe93ef6b6b78f4a62351f38cd947ad040d24d0799",
"class_name": "RelatedNodeInfo"
}
},
"text": "4 ANNOTATION CAMPAIGN\nThe complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.",
"mimetype": "text/plain",
"start_char_idx": null,
"end_char_idx": null,
"text_template": "{metadata_str}\n\n{content}",
"metadata_template": "{key}: {value}",
"metadata_seperator": "\n",
"class_name": "TextNode"
},
"score": 0.7986578345298767,
"class_name": "NodeWithScore"
},
{
"node": {
"id_": "3139d32c-93cd-49bf-9c94-1cf0dc98d2c1",
"embedding": null,
"metadata": {
"path": "$.main-text[26]"
},
"excluded_embed_metadata_keys": [
"path"
],
"excluded_llm_metadata_keys": [
"path"
],
"relationships": {
"1": {
"node_id": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc",
"node_type": "4",
"metadata": {
"dl_doc_hash": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc"
},
"hash": "1b7dc86d6f19efda934a54e72a0743d166f8d085a9089715c8b6336289ea9b36",
"class_name": "RelatedNodeInfo"
},
"2": {
"node_id": "e2acf72f-9e57-4f7a-a0ee-89aed453dd32",
"node_type": "1",
"metadata": {
"path": "$.main-text[25]"
},
"hash": "63010af438335217c6e37a6c40e2f2ef771139da74d3b568d7de62cc66b7c152",
"class_name": "RelatedNodeInfo"
},
"3": {
"node_id": "a9488d99-0bbb-4599-91ce-5dd2b45ed1f0",
"node_type": "1",
"metadata": {
"path": "$.main-text[28]"
},
"hash": "21c121b95de8c66a610670b3f309be37da9c2cefbb251ce67a5c0da216833b91",
"class_name": "RelatedNodeInfo"
}
},
"text": "1 INTRODUCTION\n(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.",
"mimetype": "text/plain",
"start_char_idx": null,
"end_char_idx": null,
"text_template": "{metadata_str}\n\n{content}",
"metadata_template": "{key}: {value}",
"metadata_seperator": "\n",
"class_name": "TextNode"
},
"score": 0.7964271903038025,
"class_name": "NodeWithScore"
},
{
"node": {
"id_": "81f631d4-a392-41a7-9777-a4774c66e0a8",
"embedding": null,
"metadata": {
"path": "$.main-text[79]"
},
"excluded_embed_metadata_keys": [
"path"
],
"excluded_llm_metadata_keys": [
"path"
],
"relationships": {
"1": {
"node_id": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc",
"node_type": "4",
"metadata": {
"dl_doc_hash": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc"
},
"hash": "1b7dc86d6f19efda934a54e72a0743d166f8d085a9089715c8b6336289ea9b36",
"class_name": "RelatedNodeInfo"
},
"2": {
"node_id": "a013ac6e-deda-4e16-9b3d-bd5ce9a1fa6f",
"node_type": "1",
"metadata": {
"path": "$.main-text[78]"
},
"hash": "d838f3136f5b6653b250269e5f12d5a451e5a0181a6cca32ee82d443a8c1d34e",
"class_name": "RelatedNodeInfo"
},
"3": {
"node_id": "5fb8d16c-2720-497d-b2eb-d6899be578c7",
"node_type": "1",
"metadata": {
"path": "$.main-text[81]"
},
"hash": "caba0fb29480faa7c0f219b753040697b1b1c0ba85f9b0fa646441ca52271729",
"class_name": "RelatedNodeInfo"
}
},
"text": "4 ANNOTATION CAMPAIGN\nPhase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted",
"mimetype": "text/plain",
"start_char_idx": null,
"end_char_idx": null,
"text_template": "{metadata_str}\n\n{content}",
"metadata_template": "{key}: {value}",
"metadata_seperator": "\n",
"class_name": "TextNode"
},
"score": 0.7854539752006531,
"class_name": "NodeWithScore"
}
]
}
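
A hedged sketch of how this fixture is consumed (mirroring the exact-match assertion in tests/test_retrieval.py below); the keys come directly from the JSON above, where each "root" entry is a serialized NodeWithScore:

import json

with open("tests/data/2_out_retrieval_results.json") as f:
    exp = json.load(f)

assert len(exp["root"]) == 3  # matches TOP_K in the test below
for hit in exp["root"]:
    # Similarity score plus the source path of the retrieved chunk.
    print(hit["score"], hit["node"]["metadata"]["path"])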
4 changes: 2 additions & 2 deletions tests/test_li_hierarchical_node_parser.py
@@ -14,8 +14,8 @@ def test_node_parse():
     with open("tests/data/1_inp_li_doc.json") as f:
         data_json = f.read()
     li_doc = LIDocument.from_json(data_json)
-    node_parser = HierarchicalNodeParser()
-    nodes = node_parser._parse_nodes(nodes=[li_doc], id_gen_seed=42)
+    node_parser = HierarchicalNodeParser(id_gen_seed=42)
+    nodes = node_parser._parse_nodes(nodes=[li_doc])
     act_data = dict(root=[n.dict() for n in nodes])
     with open("tests/data/1_out_nodes.json") as f:
         exp_data = json.load(fp=f)
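
A hedged note on why HierarchicalNodeParser(id_gen_seed=42) works after this change: NodeParser is Pydantic-based (see the Field(...) usage in the earlier hunk), so the new class-level annotation becomes a constructor keyword; an instance field also survives framework-driven invocation such as transformations=[node_parser] in the new test, where a per-call kwarg would not be forwarded. A standalone analogy with a plain Pydantic model, not the actual class:

from pydantic import BaseModel

class ParserConfig(BaseModel):  # stand-in for the Pydantic-based NodeParser
    id_gen_seed: int | None = None

assert ParserConfig().id_gen_seed is None
assert ParserConfig(id_gen_seed=42).id_gen_seed == 42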
49 changes: 49 additions & 0 deletions tests/test_retrieval.py
@@ -0,0 +1,49 @@
import json
from tempfile import TemporaryDirectory

from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore

from quackling.llama_index.node_parsers.hier_node_parser import HierarchicalNodeParser
from quackling.llama_index.readers.docling_reader import DoclingReader


def test_retrieval():
    FILE_PATH = "https://arxiv.org/pdf/2206.01062"  # DocLayNet paper
    QUERY = "How many pages were human annotated?"
    TOP_K = 3
    HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
    ID_GEN_SEED = 42
    MILVUS_DB_FNAME = "milvus_demo.db"
    MILVUS_COLL_NAME = "quackling_test_coll"

    reader = DoclingReader(parse_type=DoclingReader.ParseType.JSON)
    node_parser = HierarchicalNodeParser(id_gen_seed=ID_GEN_SEED)
    embed_model = HuggingFaceEmbedding(model_name=HF_EMBED_MODEL_ID)

    with TemporaryDirectory() as tmp_dir:
        vector_store = MilvusVectorStore(
            uri=f"{tmp_dir}/{MILVUS_DB_FNAME}",
            collection_name=MILVUS_COLL_NAME,
            dim=len(embed_model.get_text_embedding("hi")),
            overwrite=True,
        )
        docs = reader.load_data(file_path=[FILE_PATH])
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents=docs,
            storage_context=storage_context,
            transformations=[node_parser],
            embed_model=embed_model,
        )
        retriever = index.as_retriever(
            similarity_top_k=TOP_K,
            vector_store_query_mode=VectorStoreQueryMode.DEFAULT,
        )
        retr_res = retriever.retrieve(QUERY)
        act_data = dict(root=[n.dict() for n in retr_res])
        with open("tests/data/2_out_retrieval_results.json") as f:
            exp_data = json.load(fp=f)
        assert exp_data == act_data
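
A usage sketch (assumes the optional extras are installed, per the --all-extras change above; the first run downloads the arXiv PDF and the embedding model):

import pytest

# Equivalent to running: pytest -q tests/test_retrieval.py::test_retrieval
raise SystemExit(pytest.main(["-q", "tests/test_retrieval.py::test_retrieval"]))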
