Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.21.13

### Fixes
- **Replace `lazyproperty` with `functools.cached_property`**: Fix a bug where 26 properties returning `None` were re-evaluated on every access instead of being cached. Also improves performance on cached reads.

## 0.21.12
- **Add check for complex documents**: Adds a check that avoids routing documents with a high ratio of vector objects to pdfminer.

Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.21.12" # pragma: no cover
__version__ = "0.21.13" # pragma: no cover
56 changes: 28 additions & 28 deletions unstructured/chunking/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import collections
import copy
from functools import cached_property
from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast

import regex
Expand All @@ -22,7 +23,6 @@
Title,
)
from unstructured.logger import logger
from unstructured.utils import lazyproperty

# ================================================================================================
# MODEL
Expand Down Expand Up @@ -58,7 +58,7 @@ class TokenCounter:
def __init__(self, tokenizer: str):
self._tokenizer_name = tokenizer

@lazyproperty
@cached_property
def _encoder(self):
"""Lazily initialize the tiktoken encoder."""
import tiktoken
Expand Down Expand Up @@ -143,15 +143,15 @@ def new(cls, **kwargs: Any) -> Self:
self._validate()
return self

@lazyproperty
@cached_property
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks.

Overridden by sub-types to provide semantic-boundary isolation behaviors.
"""
return ()

@lazyproperty
@cached_property
def combine_text_under_n_chars(self) -> int:
"""Combine two consecutive text pre-chunks if first is smaller than this and both will fit.

Expand All @@ -161,7 +161,7 @@ def combine_text_under_n_chars(self) -> int:
arg_value = self._kwargs.get("combine_text_under_n_chars")
return arg_value if arg_value is not None else 0

@lazyproperty
@cached_property
def hard_max(self) -> int:
"""The maximum size for a chunk (in characters or tokens depending on mode).

Expand All @@ -176,7 +176,7 @@ def hard_max(self) -> int:
arg_value = self._kwargs.get("max_characters")
return arg_value if arg_value is not None else CHUNK_MAX_CHARS_DEFAULT

@lazyproperty
@cached_property
def include_orig_elements(self) -> bool:
"""When True, add original elements from pre-chunk to `.metadata.orig_elements` of chunk.

Expand All @@ -185,7 +185,7 @@ def include_orig_elements(self) -> bool:
arg_value = self._kwargs.get("include_orig_elements")
return True if arg_value is None else bool(arg_value)

@lazyproperty
@cached_property
def inter_chunk_overlap(self) -> int:
"""Characters of overlap to add between chunks.

Expand All @@ -195,7 +195,7 @@ def inter_chunk_overlap(self) -> int:
overlap_all_arg = self._kwargs.get("overlap_all")
return self.overlap if overlap_all_arg else 0

@lazyproperty
@cached_property
def overlap(self) -> int:
"""The number of characters to overlap text when splitting chunks mid-text.

Expand All @@ -205,7 +205,7 @@ def overlap(self) -> int:
overlap_arg = self._kwargs.get("overlap")
return overlap_arg or 0

@lazyproperty
@cached_property
def soft_max(self) -> int:
"""A pre-chunk of this size or greater is considered full.

Expand Down Expand Up @@ -237,7 +237,7 @@ def soft_max(self) -> int:
# -- otherwise, give them what they asked for --
return new_after_n_chars_arg

@lazyproperty
@cached_property
def split(self) -> Callable[[str], tuple[str, str]]:
"""A text-splitting function suitable for splitting the text of an oversized pre-chunk.

Expand All @@ -246,7 +246,7 @@ def split(self) -> Callable[[str], tuple[str, str]]:
"""
return _TextSplitter(self)

@lazyproperty
@cached_property
def text_separator(self) -> str:
"""The string to insert between elements when concatenating their text for a chunk.

Expand All @@ -256,7 +256,7 @@ def text_separator(self) -> str:
"""
return "\n\n"

@lazyproperty
@cached_property
def text_splitting_separators(self) -> tuple[str, ...]:
"""Sequence of text-splitting target strings to be used in order of preference."""
text_splitting_separators_arg = self._kwargs.get("text_splitting_separators")
Expand All @@ -266,13 +266,13 @@ def text_splitting_separators(self) -> tuple[str, ...]:
else tuple(text_splitting_separators_arg)
)

@lazyproperty
@cached_property
def token_counter(self) -> TokenCounter | None:
"""The token counter for token-based chunking, or None for character-based chunking."""
tokenizer = self._kwargs.get("tokenizer")
return TokenCounter(tokenizer) if tokenizer else None

@lazyproperty
@cached_property
def use_token_counting(self) -> bool:
"""True when token-based chunking is configured, False for character-based."""
return self._kwargs.get("max_tokens") is not None
Expand Down Expand Up @@ -400,7 +400,7 @@ def _iter_pre_chunks(self) -> Iterator[PreChunk]:
# -- processed
yield from pre_chunk_builder.flush()

@lazyproperty
@cached_property
def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks."""
return self._opts.boundary_predicates
Expand Down Expand Up @@ -599,7 +599,7 @@ def iter_chunks(self) -> Iterator[CompositeElement | Table | TableChunk]:
else:
yield from _Chunker.iter_chunks(self._elements, self._text, self._opts)

@lazyproperty
@cached_property
def overlap_tail(self) -> str:
"""The portion of this chunk's text to be repeated as a prefix in the next chunk.

Expand Down Expand Up @@ -628,7 +628,7 @@ def _iter_text_segments(self) -> Iterator[str]:
if text:
yield text

@lazyproperty
@cached_property
def _text(self) -> str:
"""The concatenated text of all elements in this pre-chunk, including any overlap.

Expand Down Expand Up @@ -685,7 +685,7 @@ def _iter_chunks(self) -> Iterator[CompositeElement]:
s, remainder = split(remainder)
yield CompositeElement(text=s, metadata=self._continuation_metadata)

@lazyproperty
@cached_property
def _all_metadata_values(self) -> dict[str, list[Any]]:
"""Collection of all populated metadata values across elements.

Expand Down Expand Up @@ -720,7 +720,7 @@ def iter_populated_fields(metadata: ElementMetadata) -> Iterator[tuple[str, Any]

return dict(field_values)

@lazyproperty
@cached_property
def _consolidated_metadata(self) -> ElementMetadata:
"""Metadata applicable to this pre-chunk as a single chunk.

Expand All @@ -736,7 +736,7 @@ def _consolidated_metadata(self) -> ElementMetadata:
consolidated_metadata.orig_elements = self._orig_elements
return consolidated_metadata

@lazyproperty
@cached_property
def _continuation_metadata(self) -> ElementMetadata:
"""Metadata applicable to the second and later text-split chunks of the pre-chunk.

Expand All @@ -750,7 +750,7 @@ def _continuation_metadata(self) -> ElementMetadata:
continuation_metadata.is_continuation = True
return continuation_metadata

@lazyproperty
@cached_property
def _meta_kwargs(self) -> dict[str, Any]:
"""The consolidated metadata values as a dict suitable for constructing ElementMetadata.

Expand Down Expand Up @@ -787,7 +787,7 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]:

return dict(iter_kwarg_pairs())

@lazyproperty
@cached_property
def _orig_elements(self) -> list[Element]:
"""The `.metadata.orig_elements` value for chunks formed from this pre-chunk."""

Expand Down Expand Up @@ -858,7 +858,7 @@ def _iter_chunks(self) -> Iterator[Table | TableChunk]:
# -- otherwise, form splits with "synchronized" text and html --
yield from self._iter_text_and_html_table_chunks()

@lazyproperty
@cached_property
def _html(self) -> str:
"""The compactified HTML for this table when it has text-as-HTML.

Expand All @@ -870,7 +870,7 @@ def _html(self) -> str:

return html_table.html

@lazyproperty
@cached_property
def _html_table(self) -> HtmlTable | None:
"""The `lxml` HTML element object for this table.

Expand Down Expand Up @@ -960,7 +960,7 @@ def _metadata(self) -> ElementMetadata:
metadata.orig_elements = self._orig_elements
return metadata

@lazyproperty
@cached_property
def _orig_elements(self) -> list[Element]:
"""The `.metadata.orig_elements` value for chunks formed from this pre-chunk.

Expand All @@ -975,14 +975,14 @@ def _orig_elements(self) -> list[Element]:
orig_table.metadata.orig_elements = None
return [orig_table]

@lazyproperty
@cached_property
def _table_text(self) -> str:
"""The text in this table, not including any overlap-prefix or extra whitespace."""
if not self._table.text:
return ""
return " ".join(self._table.text.split())

@lazyproperty
@cached_property
def _text_with_overlap(self) -> str:
"""The text for this chunk, including the overlap-prefix when present."""
overlap_prefix = self._overlap_prefix
Expand Down Expand Up @@ -1256,7 +1256,7 @@ def _get_token_overlap_tail(self, text: str, target_tokens: int) -> str:

return text[pos:]

@lazyproperty
@cached_property
def _patterns(self) -> tuple[tuple[regex.Pattern[str], int], ...]:
"""Sequence of (pattern, len) pairs to match against.

Expand Down
5 changes: 3 additions & 2 deletions unstructured/chunking/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@
import dataclasses as dc
import functools
import inspect
from functools import cached_property
from typing import Any, Callable, Iterable, Optional, Protocol

from typing_extensions import ParamSpec

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import Element
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
from unstructured.utils import get_call_args_applying_defaults

_P = ParamSpec("_P")

Expand Down Expand Up @@ -113,7 +114,7 @@ class _ChunkerSpec:
chunker: Chunker
"""The "chunk_by_{x}() function that implements this chunking strategy."""

@lazyproperty
@cached_property
def kw_arg_names(self) -> tuple[str, ...]:
"""Keyword arguments supported by this chunker.

Expand Down
8 changes: 4 additions & 4 deletions unstructured/chunking/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from __future__ import annotations

from functools import cached_property
from typing import Iterable, Iterator, Optional

from unstructured.chunking.base import (
Expand All @@ -17,7 +18,6 @@
is_title,
)
from unstructured.documents.elements import Element
from unstructured.utils import lazyproperty


def chunk_by_title(
Expand Down Expand Up @@ -124,7 +124,7 @@ class _ByTitleChunkingOptions(ChunkingOptions):
appearing on two different pages can appear in the same chunk.
"""

@lazyproperty
@cached_property
def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
"""The semantic-boundary detectors to be applied to break pre-chunks.

Expand All @@ -140,7 +140,7 @@ def iter_boundary_predicates() -> Iterator[BoundaryPredicate]:

return tuple(iter_boundary_predicates())

@lazyproperty
@cached_property
def combine_text_under_n_chars(self) -> int:
"""Combine consecutive text pre-chunks if former is smaller than this and both will fit.

Expand All @@ -152,7 +152,7 @@ def combine_text_under_n_chars(self) -> int:
arg_value = self._kwargs.get("combine_text_under_n_chars")
return self.hard_max if arg_value is None else arg_value

@lazyproperty
@cached_property
def multipage_sections(self) -> bool:
"""When False, break pre-chunks on page-boundaries."""
arg_value = self._kwargs.get("multipage_sections")
Expand Down
Loading
Loading