diff --git a/quackling/llama_index/readers/docling_reader.py b/quackling/llama_index/readers/docling_reader.py
index 88c34e5..d5c0b1b 100644
--- a/quackling/llama_index/readers/docling_reader.py
+++ b/quackling/llama_index/readers/docling_reader.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: MIT
 #
 
+import json
 from enum import Enum
 from typing import Iterable
 
@@ -49,3 +50,14 @@ def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:
             dl_doc = converter.convert_single(source).output
             li_doc = self._create_li_doc_from_dl_doc(dl_doc=dl_doc)
             yield li_doc
+
+    def lazy_load_docs(self, file_path: str | list[str]) -> Iterable[LIDocument]:
+
+        file_paths = file_path if isinstance(file_path, list) else [file_path]
+
+        for source in file_paths:
+            with open(source, encoding="utf-8") as file_obj:
+                data = json.load(file_obj)
+            dl_doc: DLDocument = DLDocument.model_validate(data)
+            li_doc: LIDocument = self._create_li_doc_from_dl_doc(dl_doc=dl_doc)
+            yield li_doc
diff --git a/tests/unit/test_li_docling_reader.py b/tests/unit/test_li_docling_reader.py
new file mode 100644
index 0000000..987eb3a
--- /dev/null
+++ b/tests/unit/test_li_docling_reader.py
@@ -0,0 +1,17 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+import os
+
+from quackling.llama_index.readers.docling_reader import DoclingReader
+
+
+def test_lazy_load_docs():
+    reader = DoclingReader(parse_type=DoclingReader.ParseType.JSON)
+
+    file_path = "tests/unit/data/0_inp_dl_doc.json"
+    assert os.path.exists(file_path)
+    li_docs = list(reader.lazy_load_docs(file_path))
+    assert len(li_docs) == 1