From 0ce1b4c96e619b9e7fe836d64e3e73ac70a53f4e Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Thu, 5 Sep 2024 13:07:44 +0200
Subject: [PATCH] feat: create a docling reader for docs as JSON. (#10)

New class DoclingJSONReader from BaseDoclingReader to read docs as JSON.
Unit test for this class.

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
Note: the two new files below were edited after `git format-patch`
(docstrings, working-directory-independent test fixture path, extra test
for list input), so the blob ids on their `index` lines refer to the
pre-edit content.

 quackling/llama_index/readers/__init__.py     |  1 +
 .../readers/docling_json_reader.py            | 35 +++++++++++++++++++
 tests/unit/test_li_docling_json_reader.py     | 26 ++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 quackling/llama_index/readers/docling_json_reader.py
 create mode 100644 tests/unit/test_li_docling_json_reader.py

diff --git a/quackling/llama_index/readers/__init__.py b/quackling/llama_index/readers/__init__.py
index c7803b8..99a6af7 100644
--- a/quackling/llama_index/readers/__init__.py
+++ b/quackling/llama_index/readers/__init__.py
@@ -3,4 +3,5 @@
 # SPDX-License-Identifier: MIT
 #
 
+from quackling.llama_index.readers.docling_json_reader import DoclingJSONReader  # noqa
 from quackling.llama_index.readers.docling_pdf_reader import DoclingPDFReader  # noqa
diff --git a/quackling/llama_index/readers/docling_json_reader.py b/quackling/llama_index/readers/docling_json_reader.py
new file mode 100644
index 0000000..1d04aaa
--- /dev/null
+++ b/quackling/llama_index/readers/docling_json_reader.py
@@ -0,0 +1,35 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+import json
+from typing import Iterable
+
+from docling_core.types import Document as DLDocument
+from llama_index.core.schema import Document as LIDocument
+
+from quackling.llama_index.readers.base import BaseDoclingReader
+
+
+class DoclingJSONReader(BaseDoclingReader):
+    """Reader that loads Docling documents stored as JSON files."""
+
+    def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:
+        """Lazily convert Docling JSON files into LlamaIndex documents.
+
+        Args:
+            file_path: Path of a single JSON file, or a list of such paths.
+
+        Yields:
+            One LlamaIndex document per input file, in input order.
+        """
+        file_paths = file_path if isinstance(file_path, list) else [file_path]
+
+        for source in file_paths:
+            # Hold the file handle only while parsing, so it is already closed
+            # when the generator is suspended at the `yield` below.
+            with open(source, encoding="utf-8") as file_obj:
+                data = json.load(file_obj)
+            dl_doc: DLDocument = DLDocument.model_validate(data)
+            yield self._create_li_doc_from_dl_doc(dl_doc=dl_doc)
diff --git a/tests/unit/test_li_docling_json_reader.py b/tests/unit/test_li_docling_json_reader.py
new file mode 100644
index 0000000..96c13fd
--- /dev/null
+++ b/tests/unit/test_li_docling_json_reader.py
@@ -0,0 +1,26 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+from pathlib import Path
+
+from quackling.llama_index.readers import DoclingJSONReader
+
+# Resolve the fixture relative to this file so the tests do not depend on the
+# directory pytest is launched from.
+INPUT_FILE = str(Path(__file__).parent / "data" / "0_inp_dl_doc.json")
+
+
+def test_lazy_load_data():
+    """A single path yields exactly one document."""
+    reader = DoclingJSONReader(parse_type=DoclingJSONReader.ParseType.JSON)
+    li_docs = list(reader.lazy_load_data(INPUT_FILE))
+    assert len(li_docs) == 1
+
+
+def test_lazy_load_data_with_list():
+    """A list of paths yields one document per path."""
+    reader = DoclingJSONReader(parse_type=DoclingJSONReader.ParseType.JSON)
+    li_docs = list(reader.lazy_load_data([INPUT_FILE, INPUT_FILE]))
+    assert len(li_docs) == 2