Skip to content

Commit

Permalink
feat: add hierarchical chunker (#18)
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas authored Sep 11, 2024
1 parent c482610 commit 9698d30
Show file tree
Hide file tree
Showing 10 changed files with 1,259 additions and 4 deletions.
6 changes: 6 additions & 0 deletions docling_core/transforms/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Data transformations package."""
15 changes: 15 additions & 0 deletions docling_core/transforms/chunker/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Define the chunker types."""

from docling_core.transforms.chunker.base import ( # noqa
BaseChunker,
Chunk,
ChunkWithMetadata,
)
from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
HierarchicalChunker,
)
45 changes: 45 additions & 0 deletions docling_core/transforms/chunker/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#
# Copyright IBM Corp. 2024 - 2024
# SPDX-License-Identifier: MIT
#

"""Define base classes for chunking."""
from abc import ABC, abstractmethod
from typing import Iterator, Optional

from pydantic import BaseModel

from docling_core.types import BoundingBox, Document


class Chunk(BaseModel):
    """Data model for Chunk.

    A Pydantic model carrying two string fields, both declared without a
    default value.
    """

    # presumably identifies where in the source document the chunk comes
    # from; the exact path format is not visible here — TODO confirm
    path: str
    # the text carried by the chunk
    text: str


class ChunkWithMetadata(Chunk):
    """Data model for Chunk including metadata.

    Extends `Chunk` (inheriting `path` and `text`) with two nullable fields.
    """

    # NOTE(review): neither field below declares a default. Under Pydantic v2
    # such fields are required (they accept None but must be passed
    # explicitly); under Pydantic v1 they would default to None. Confirm
    # which behavior is intended.

    # presumably the page number the chunk was found on — TODO confirm
    # whether it is 0- or 1-based
    page: Optional[int]
    # presumably the bounding box of the chunk on that page — TODO confirm
    # the coordinate convention of BoundingBox
    bbox: Optional[BoundingBox]


class BaseChunker(BaseModel, ABC):
    """Abstract interface that concrete chunkers derive from."""

    @abstractmethod
    def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
        """Split the given document into chunks.

        This abstract version carries no chunking logic of its own.

        Args:
            dl_doc (Document): the document to split into chunks
            **kwargs: extra keyword arguments; not used by this abstract
                version

        Raises:
            NotImplementedError: always, when this abstract version is called

        Yields:
            Iterator[Chunk]: the chunks extracted from the document
        """
        raise NotImplementedError
Loading

0 comments on commit 9698d30

Please sign in to comment.