Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
feat: create a docling reader for docs as JSON.
Browse files Browse the repository at this point in the history
New class DoclingJSONReader from BaseDoclingReader to read docs as JSON.
Unit test for this class

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam committed Sep 5, 2024
1 parent c236a43 commit b32d107
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 0 deletions.
25 changes: 25 additions & 0 deletions quackling/llama_index/readers/docling_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

import json
from typing import Iterable

from docling_core.types import Document as DLDocument
from llama_index.core.schema import Document as LIDocument

from quackling.llama_index.readers.base import BaseDoclingReader


class DoclingJSONReader(BaseDoclingReader):
    """Reader that loads Docling documents stored as JSON files."""

    def lazy_load_data(self, file_path: str | list[str]) -> Iterable[LIDocument]:
        """Lazily read JSON files, yielding one LlamaIndex document per file.

        Args:
            file_path: A single path, or a list of paths, to UTF-8 encoded
                JSON files. The parsed content of each file is validated as
                a Docling document.

        Yields:
            The result of ``_create_li_doc_from_dl_doc`` for each input file,
            in the order the paths were given.

        Raises:
            Nothing is caught here: errors from ``open``, ``json.load`` and
            ``DLDocument.model_validate`` propagate to the caller when the
            corresponding item is requested from the generator.
        """
        file_paths = file_path if isinstance(file_path, list) else [file_path]

        for source in file_paths:
            # Hold the file handle only while parsing: this is a generator, so
            # anything still open at the ``yield`` would stay open for as long
            # as the caller keeps the generator suspended (or abandons it).
            with open(source, encoding="utf-8") as file_obj:
                data = json.load(file_obj)

            dl_doc: DLDocument = DLDocument.model_validate(data)
            li_doc: LIDocument = self._create_li_doc_from_dl_doc(dl_doc=dl_doc)
            yield li_doc
16 changes: 16 additions & 0 deletions tests/unit/test_li_docling_json_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

import os

from quackling.llama_index.readers.docling_json_reader import DoclingJSONReader


def test_lazy_load_data():
    """Loading a single JSON input file must produce exactly one document."""
    json_reader = DoclingJSONReader(parse_type=DoclingJSONReader.ParseType.JSON)

    # Drain the lazy loader so the number of produced documents can be counted.
    loaded_docs = list(json_reader.lazy_load_data("tests/unit/data/0_inp_dl_doc.json"))

    assert len(loaded_docs) == 1

0 comments on commit b32d107

Please sign in to comment.