Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
chore: create a docling reader for docs as JSON.
Browse files Browse the repository at this point in the history
Add a new class, DoclingJSONReader, derived from BaseDoclingReader, that reads documents serialized as JSON.
Add a unit test for this class.

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam committed Sep 5, 2024
1 parent c236a43 commit 3eb333e
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 0 deletions.
25 changes: 25 additions & 0 deletions quackling/llama_index/readers/docling_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

import json
from typing import Iterable

from docling_core.types import Document as DLDocument
from llama_index.core.schema import Document as LIDocument

from quackling.llama_index.readers.base import BaseDoclingReader


class DoclingJSONReader(BaseDoclingReader):
    """Reader that loads Docling documents stored as JSON files."""

    def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:
        """Lazily read Docling JSON files and yield LlamaIndex documents.

        Args:
            file_path: Path to a single JSON file, or a list of such paths.

        Yields:
            One LlamaIndex document per input path, in the order given.

        Raises:
            OSError: If a file cannot be opened.
            json.JSONDecodeError: If a file does not contain valid JSON.

        Errors raised by ``DLDocument.model_validate`` or by the conversion
        helper propagate to the caller unchanged.
        """
        file_paths = file_path if isinstance(file_path, list) else [file_path]

        for source in file_paths:
            # Keep the file open only while parsing: this is a generator, so
            # yielding inside the ``with`` block would hold the handle open
            # for as long as the consumer keeps the generator suspended.
            with open(source, encoding="utf-8") as file_obj:
                data = json.load(file_obj)

            dl_doc: DLDocument = DLDocument.model_validate(data)
            li_doc: LIDocument = self._create_li_doc_from_dl_doc(dl_doc=dl_doc)
            yield li_doc
17 changes: 17 additions & 0 deletions tests/unit/test_li_docling_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

import os

from quackling.llama_index.readers.docling_json_reader import DoclingJSONReader


def test_lazy_load_data():
    """Check that loading one Docling JSON file yields exactly one document."""
    json_path = "tests/unit/data/0_inp_dl_doc.json"
    json_reader = DoclingJSONReader(parse_type=DoclingJSONReader.ParseType.JSON)

    # Fail early with a clear signal if the fixture file is missing.
    assert os.path.exists(json_path)

    # Exhaust the lazy generator so the documents can be counted.
    loaded_docs = [*json_reader.lazy_load_data(json_path)]
    assert len(loaded_docs) == 1

0 comments on commit 3eb333e

Please sign in to comment.