This repository has been archived by the owner on Sep 11, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Panos Vagenas <[email protected]>
- Loading branch information
Showing
5 changed files
with
224 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
{ | ||
"root": [ | ||
{ | ||
"node": { | ||
"id_": "b8db0672-f42d-47cc-80d4-af5974273ca3", | ||
"embedding": null, | ||
"metadata": { | ||
"path": "$.main-text[74]" | ||
}, | ||
"excluded_embed_metadata_keys": [ | ||
"path" | ||
], | ||
"excluded_llm_metadata_keys": [ | ||
"path" | ||
], | ||
"relationships": { | ||
"1": { | ||
"node_id": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc", | ||
"node_type": "4", | ||
"metadata": { | ||
"dl_doc_hash": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc" | ||
}, | ||
"hash": "1b7dc86d6f19efda934a54e72a0743d166f8d085a9089715c8b6336289ea9b36", | ||
"class_name": "RelatedNodeInfo" | ||
}, | ||
"2": { | ||
"node_id": "287d06ca-6f4c-469a-8b22-d3081c8eaee9", | ||
"node_type": "1", | ||
"metadata": { | ||
"path": "$.main-text[73]" | ||
}, | ||
"hash": "5804beb788b99ed6df0a7f2d99073687280737a08fdad6ecdc51df3bac8894b9", | ||
"class_name": "RelatedNodeInfo" | ||
}, | ||
"3": { | ||
"node_id": "f8cda88b-436d-46e2-b83c-fe0be037e5ed", | ||
"node_type": "1", | ||
"metadata": { | ||
"path": "$.main-text[75]" | ||
}, | ||
"hash": "6abf99c75ad9eb9d0f6bb81fe93ef6b6b78f4a62351f38cd947ad040d24d0799", | ||
"class_name": "RelatedNodeInfo" | ||
} | ||
}, | ||
"text": "4 ANNOTATION CAMPAIGN\nThe complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.", | ||
"mimetype": "text/plain", | ||
"start_char_idx": null, | ||
"end_char_idx": null, | ||
"text_template": "{metadata_str}\n\n{content}", | ||
"metadata_template": "{key}: {value}", | ||
"metadata_seperator": "\n", | ||
"class_name": "TextNode" | ||
}, | ||
"score": 0.7986578345298767, | ||
"class_name": "NodeWithScore" | ||
}, | ||
{ | ||
"node": { | ||
"id_": "3139d32c-93cd-49bf-9c94-1cf0dc98d2c1", | ||
"embedding": null, | ||
"metadata": { | ||
"path": "$.main-text[26]" | ||
}, | ||
"excluded_embed_metadata_keys": [ | ||
"path" | ||
], | ||
"excluded_llm_metadata_keys": [ | ||
"path" | ||
], | ||
"relationships": { | ||
"1": { | ||
"node_id": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc", | ||
"node_type": "4", | ||
"metadata": { | ||
"dl_doc_hash": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc" | ||
}, | ||
"hash": "1b7dc86d6f19efda934a54e72a0743d166f8d085a9089715c8b6336289ea9b36", | ||
"class_name": "RelatedNodeInfo" | ||
}, | ||
"2": { | ||
"node_id": "e2acf72f-9e57-4f7a-a0ee-89aed453dd32", | ||
"node_type": "1", | ||
"metadata": { | ||
"path": "$.main-text[25]" | ||
}, | ||
"hash": "63010af438335217c6e37a6c40e2f2ef771139da74d3b568d7de62cc66b7c152", | ||
"class_name": "RelatedNodeInfo" | ||
}, | ||
"3": { | ||
"node_id": "a9488d99-0bbb-4599-91ce-5dd2b45ed1f0", | ||
"node_type": "1", | ||
"metadata": { | ||
"path": "$.main-text[28]" | ||
}, | ||
"hash": "21c121b95de8c66a610670b3f309be37da9c2cefbb251ce67a5c0da216833b91", | ||
"class_name": "RelatedNodeInfo" | ||
} | ||
}, | ||
"text": "1 INTRODUCTION\n(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.", | ||
"mimetype": "text/plain", | ||
"start_char_idx": null, | ||
"end_char_idx": null, | ||
"text_template": "{metadata_str}\n\n{content}", | ||
"metadata_template": "{key}: {value}", | ||
"metadata_seperator": "\n", | ||
"class_name": "TextNode" | ||
}, | ||
"score": 0.7964271903038025, | ||
"class_name": "NodeWithScore" | ||
}, | ||
{ | ||
"node": { | ||
"id_": "81f631d4-a392-41a7-9777-a4774c66e0a8", | ||
"embedding": null, | ||
"metadata": { | ||
"path": "$.main-text[79]" | ||
}, | ||
"excluded_embed_metadata_keys": [ | ||
"path" | ||
], | ||
"excluded_llm_metadata_keys": [ | ||
"path" | ||
], | ||
"relationships": { | ||
"1": { | ||
"node_id": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc", | ||
"node_type": "4", | ||
"metadata": { | ||
"dl_doc_hash": "5dfbd8c115a15fd3396b68409124cfee29fc8efac7b5c846634ff924e635e0dc" | ||
}, | ||
"hash": "1b7dc86d6f19efda934a54e72a0743d166f8d085a9089715c8b6336289ea9b36", | ||
"class_name": "RelatedNodeInfo" | ||
}, | ||
"2": { | ||
"node_id": "a013ac6e-deda-4e16-9b3d-bd5ce9a1fa6f", | ||
"node_type": "1", | ||
"metadata": { | ||
"path": "$.main-text[78]" | ||
}, | ||
"hash": "d838f3136f5b6653b250269e5f12d5a451e5a0181a6cca32ee82d443a8c1d34e", | ||
"class_name": "RelatedNodeInfo" | ||
}, | ||
"3": { | ||
"node_id": "5fb8d16c-2720-497d-b2eb-d6899be578c7", | ||
"node_type": "1", | ||
"metadata": { | ||
"path": "$.main-text[81]" | ||
}, | ||
"hash": "caba0fb29480faa7c0f219b753040697b1b1c0ba85f9b0fa646441ca52271729", | ||
"class_name": "RelatedNodeInfo" | ||
} | ||
}, | ||
"text": "4 ANNOTATION CAMPAIGN\nPhase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted", | ||
"mimetype": "text/plain", | ||
"start_char_idx": null, | ||
"end_char_idx": null, | ||
"text_template": "{metadata_str}\n\n{content}", | ||
"metadata_template": "{key}: {value}", | ||
"metadata_seperator": "\n", | ||
"class_name": "TextNode" | ||
}, | ||
"score": 0.7854539752006531, | ||
"class_name": "NodeWithScore" | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import json | ||
from tempfile import TemporaryDirectory | ||
|
||
from llama_index.core import StorageContext, VectorStoreIndex | ||
from llama_index.core.vector_stores.types import VectorStoreQueryMode | ||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding | ||
from llama_index.vector_stores.milvus import MilvusVectorStore | ||
|
||
from quackling.llama_index.node_parsers.hier_node_parser import HierarchicalNodeParser | ||
from quackling.llama_index.readers.docling_reader import DoclingReader | ||
|
||
|
||
def test_retrieval():
    """Check retrieval over the DocLayNet paper against the stored fixture.

    The paper is loaded with ``DoclingReader``, split by
    ``HierarchicalNodeParser``, embedded with a Hugging Face model and indexed
    into a ``MilvusVectorStore`` whose ``uri`` points at a file inside a
    temporary directory. The top matches for a fixed query are serialized and
    compared for exact equality with the expected JSON results.
    """
    source_url = "https://arxiv.org/pdf/2206.01062"  # DocLayNet paper
    question = "How many pages were human annotated?"
    top_k = 3
    embed_model_id = "BAAI/bge-small-en-v1.5"
    id_seed = 42
    db_filename = "milvus_demo.db"
    collection = "quackling_test_coll"

    reader = DoclingReader(parse_type=DoclingReader.ParseType.JSON)
    # Fixed seed: the expected results contain concrete node ids, so id
    # generation presumably has to be reproducible -- confirm in the parser.
    node_parser = HierarchicalNodeParser(id_gen_seed=id_seed)
    embed_model = HuggingFaceEmbedding(model_name=embed_model_id)

    with TemporaryDirectory() as tmp_dir:
        db_uri = f"{tmp_dir}/{db_filename}"
        # Derive the vector dimension from the model itself by embedding a
        # throwaway string instead of hard-coding it.
        embedding_dim = len(embed_model.get_text_embedding("hi"))
        vector_store = MilvusVectorStore(
            uri=db_uri,
            collection_name=collection,
            dim=embedding_dim,
            overwrite=True,
        )

        docs = reader.load_data(file_path=[source_url])
        index = VectorStoreIndex.from_documents(
            documents=docs,
            storage_context=StorageContext.from_defaults(vector_store=vector_store),
            transformations=[node_parser],
            embed_model=embed_model,
        )

        retriever = index.as_retriever(
            similarity_top_k=top_k,
            vector_store_query_mode=VectorStoreQueryMode.DEFAULT,
        )
        hits = retriever.retrieve(question)
        act_data = {"root": [hit.dict() for hit in hits]}

        # The expected-results path is relative, so it resolves against the
        # current working directory of the test run.
        with open("tests/data/2_out_retrieval_results.json") as expected_file:
            exp_data = json.load(expected_file)

        assert exp_data == act_data