Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
feat: create a docling reader for docs as JSON. (#10)
Browse files Browse the repository at this point in the history
Add a new class, DoclingJSONReader, derived from BaseDoclingReader, to read Docling documents serialized as JSON.
Add a unit test for this class.

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam authored Sep 5, 2024
1 parent c236a43 commit 0ce1b4c
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 0 deletions.
1 change: 1 addition & 0 deletions quackling/llama_index/readers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
# SPDX-License-Identifier: MIT
#

from quackling.llama_index.readers.docling_json_reader import DoclingJSONReader # noqa
from quackling.llama_index.readers.docling_pdf_reader import DoclingPDFReader # noqa
25 changes: 25 additions & 0 deletions quackling/llama_index/readers/docling_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

import json
from typing import Iterable

from docling_core.types import Document as DLDocument
from llama_index.core.schema import Document as LIDocument

from quackling.llama_index.readers.base import BaseDoclingReader


class DoclingJSONReader(BaseDoclingReader):
    """Reader for Docling documents that are stored on disk as JSON files."""

    def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:
        """Lazily yield one LlamaIndex document per JSON file.

        Args:
            file_path: Path of a single JSON file, or a list of such paths.

        Yields:
            The LlamaIndex document built from each file, in input order.
        """
        # A lone path is wrapped so both call styles share the same loop.
        if isinstance(file_path, list):
            sources = file_path
        else:
            sources = [file_path]

        for json_path in sources:
            with open(json_path, encoding="utf-8") as handle:
                payload = json.load(handle)

            # Validate the raw JSON against the Docling document model, then
            # hand the result to the conversion helper (not defined in this
            # class; presumably provided by BaseDoclingReader).
            parsed: DLDocument = DLDocument.model_validate(payload)
            converted: LIDocument = self._create_li_doc_from_dl_doc(dl_doc=parsed)
            yield converted
14 changes: 14 additions & 0 deletions tests/unit/test_li_docling_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

from quackling.llama_index.readers import DoclingJSONReader


def test_lazy_load_data():
    """Check that reading one Docling JSON file yields exactly one document."""
    json_reader = DoclingJSONReader(parse_type=DoclingJSONReader.ParseType.JSON)

    # Drain the lazy generator so the number of produced documents can be counted.
    loaded_docs = list(json_reader.lazy_load_data("tests/unit/data/0_inp_dl_doc.json"))

    assert len(loaded_docs) == 1

0 comments on commit 0ce1b4c

Please sign in to comment.