diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c09b47e70..ac18860338 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.21.13 + +### Fixes +- **Replace `lazyproperty` with `functools.cached_property`**: Fix a bug where 26 properties returning `None` were re-evaluated on every access instead of caching. Also improves performance on cached reads. + ## 0.21.12 - **Add Check for complex documents**: Adds a check for complex documents to avoid pdfminer with a high ratio of vector objects diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 1a6f7540fc..8e21a799c3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.21.12" # pragma: no cover +__version__ = "0.21.13" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 5fe58093a0..6ebc3e35b1 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -4,6 +4,7 @@ import collections import copy +from functools import cached_property from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast import regex @@ -22,7 +23,6 @@ Title, ) from unstructured.logger import logger -from unstructured.utils import lazyproperty # ================================================================================================ # MODEL @@ -58,7 +58,7 @@ class TokenCounter: def __init__(self, tokenizer: str): self._tokenizer_name = tokenizer - @lazyproperty + @cached_property def _encoder(self): """Lazily initialize the tiktoken encoder.""" import tiktoken @@ -143,7 +143,7 @@ def new(cls, **kwargs: Any) -> Self: self._validate() return self - @lazyproperty + @cached_property def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: """The semantic-boundary detectors to be applied to break pre-chunks. @@ -151,7 +151,7 @@ def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: """ return () - @lazyproperty + @cached_property def combine_text_under_n_chars(self) -> int: """Combine two consecutive text pre-chunks if first is smaller than this and both will fit. @@ -161,7 +161,7 @@ def combine_text_under_n_chars(self) -> int: arg_value = self._kwargs.get("combine_text_under_n_chars") return arg_value if arg_value is not None else 0 - @lazyproperty + @cached_property def hard_max(self) -> int: """The maximum size for a chunk (in characters or tokens depending on mode). @@ -176,7 +176,7 @@ def hard_max(self) -> int: arg_value = self._kwargs.get("max_characters") return arg_value if arg_value is not None else CHUNK_MAX_CHARS_DEFAULT - @lazyproperty + @cached_property def include_orig_elements(self) -> bool: """When True, add original elements from pre-chunk to `.metadata.orig_elements` of chunk. @@ -185,7 +185,7 @@ def include_orig_elements(self) -> bool: arg_value = self._kwargs.get("include_orig_elements") return True if arg_value is None else bool(arg_value) - @lazyproperty + @cached_property def inter_chunk_overlap(self) -> int: """Characters of overlap to add between chunks. @@ -195,7 +195,7 @@ def inter_chunk_overlap(self) -> int: overlap_all_arg = self._kwargs.get("overlap_all") return self.overlap if overlap_all_arg else 0 - @lazyproperty + @cached_property def overlap(self) -> int: """The number of characters to overlap text when splitting chunks mid-text. @@ -205,7 +205,7 @@ def overlap(self) -> int: overlap_arg = self._kwargs.get("overlap") return overlap_arg or 0 - @lazyproperty + @cached_property def soft_max(self) -> int: """A pre-chunk of this size or greater is considered full. @@ -237,7 +237,7 @@ def soft_max(self) -> int: # -- otherwise, give them what they asked for -- return new_after_n_chars_arg - @lazyproperty + @cached_property def split(self) -> Callable[[str], tuple[str, str]]: """A text-splitting function suitable for splitting the text of an oversized pre-chunk. @@ -246,7 +246,7 @@ def split(self) -> Callable[[str], tuple[str, str]]: """ return _TextSplitter(self) - @lazyproperty + @cached_property def text_separator(self) -> str: """The string to insert between elements when concatenating their text for a chunk. @@ -256,7 +256,7 @@ def text_separator(self) -> str: """ return "\n\n" - @lazyproperty + @cached_property def text_splitting_separators(self) -> tuple[str, ...]: """Sequence of text-splitting target strings to be used in order of preference.""" text_splitting_separators_arg = self._kwargs.get("text_splitting_separators") @@ -266,13 +266,13 @@ def text_splitting_separators(self) -> tuple[str, ...]: else tuple(text_splitting_separators_arg) ) - @lazyproperty + @cached_property def token_counter(self) -> TokenCounter | None: """The token counter for token-based chunking, or None for character-based chunking.""" tokenizer = self._kwargs.get("tokenizer") return TokenCounter(tokenizer) if tokenizer else None - @lazyproperty + @cached_property def use_token_counting(self) -> bool: """True when token-based chunking is configured, False for character-based.""" return self._kwargs.get("max_tokens") is not None @@ -400,7 +400,7 @@ def _iter_pre_chunks(self) -> Iterator[PreChunk]: # -- processed yield from pre_chunk_builder.flush() - @lazyproperty + @cached_property def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: """The semantic-boundary detectors to be applied to break pre-chunks.""" return self._opts.boundary_predicates @@ -599,7 +599,7 @@ def iter_chunks(self) -> Iterator[CompositeElement | Table | TableChunk]: else: yield from _Chunker.iter_chunks(self._elements, self._text, self._opts) - @lazyproperty + @cached_property def overlap_tail(self) -> str: """The portion of this chunk's text to be repeated as a prefix in the next chunk. @@ -628,7 +628,7 @@ def _iter_text_segments(self) -> Iterator[str]: if text: yield text - @lazyproperty + @cached_property def _text(self) -> str: """The concatenated text of all elements in this pre-chunk, including any overlap. @@ -685,7 +685,7 @@ def _iter_chunks(self) -> Iterator[CompositeElement]: s, remainder = split(remainder) yield CompositeElement(text=s, metadata=self._continuation_metadata) - @lazyproperty + @cached_property def _all_metadata_values(self) -> dict[str, list[Any]]: """Collection of all populated metadata values across elements. @@ -720,7 +720,7 @@ def iter_populated_fields(metadata: ElementMetadata) -> Iterator[tuple[str, Any] return dict(field_values) - @lazyproperty + @cached_property def _consolidated_metadata(self) -> ElementMetadata: """Metadata applicable to this pre-chunk as a single chunk. @@ -736,7 +736,7 @@ def _consolidated_metadata(self) -> ElementMetadata: consolidated_metadata.orig_elements = self._orig_elements return consolidated_metadata - @lazyproperty + @cached_property def _continuation_metadata(self) -> ElementMetadata: """Metadata applicable to the second and later text-split chunks of the pre-chunk. @@ -750,7 +750,7 @@ def _continuation_metadata(self) -> ElementMetadata: continuation_metadata.is_continuation = True return continuation_metadata - @lazyproperty + @cached_property def _meta_kwargs(self) -> dict[str, Any]: """The consolidated metadata values as a dict suitable for constructing ElementMetadata. @@ -787,7 +787,7 @@ def iter_kwarg_pairs() -> Iterator[tuple[str, Any]]: return dict(iter_kwarg_pairs()) - @lazyproperty + @cached_property def _orig_elements(self) -> list[Element]: """The `.metadata.orig_elements` value for chunks formed from this pre-chunk.""" @@ -858,7 +858,7 @@ def _iter_chunks(self) -> Iterator[Table | TableChunk]: # -- otherwise, form splits with "synchronized" text and html -- yield from self._iter_text_and_html_table_chunks() - @lazyproperty + @cached_property def _html(self) -> str: """The compactified HTML for this table when it has text-as-HTML. @@ -870,7 +870,7 @@ def _html(self) -> str: return html_table.html - @lazyproperty + @cached_property def _html_table(self) -> HtmlTable | None: """The `lxml` HTML element object for this table. @@ -960,7 +960,7 @@ def _metadata(self) -> ElementMetadata: metadata.orig_elements = self._orig_elements return metadata - @lazyproperty + @cached_property def _orig_elements(self) -> list[Element]: """The `.metadata.orig_elements` value for chunks formed from this pre-chunk. @@ -975,14 +975,14 @@ def _orig_elements(self) -> list[Element]: orig_table.metadata.orig_elements = None return [orig_table] - @lazyproperty + @cached_property def _table_text(self) -> str: """The text in this table, not including any overlap-prefix or extra whitespace.""" if not self._table.text: return "" return " ".join(self._table.text.split()) - @lazyproperty + @cached_property def _text_with_overlap(self) -> str: """The text for this chunk, including the overlap-prefix when present.""" overlap_prefix = self._overlap_prefix @@ -1256,7 +1256,7 @@ def _get_token_overlap_tail(self, text: str, target_tokens: int) -> str: return text[pos:] - @lazyproperty + @cached_property def _patterns(self) -> tuple[tuple[regex.Pattern[str], int], ...]: """Sequence of (pattern, len) pairs to match against. diff --git a/unstructured/chunking/dispatch.py b/unstructured/chunking/dispatch.py index a229d59432..bbcbd9d580 100644 --- a/unstructured/chunking/dispatch.py +++ b/unstructured/chunking/dispatch.py @@ -9,6 +9,7 @@ import dataclasses as dc import functools import inspect +from functools import cached_property from typing import Any, Callable, Iterable, Optional, Protocol from typing_extensions import ParamSpec @@ -16,7 +17,7 @@ from unstructured.chunking.basic import chunk_elements from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import Element -from unstructured.utils import get_call_args_applying_defaults, lazyproperty +from unstructured.utils import get_call_args_applying_defaults _P = ParamSpec("_P") @@ -113,7 +114,7 @@ class _ChunkerSpec: chunker: Chunker """The "chunk_by_{x}() function that implements this chunking strategy.""" - @lazyproperty + @cached_property def kw_arg_names(self) -> tuple[str, ...]: """Keyword arguments supported by this chunker. diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index 943c943f24..cfbd5cdb6b 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -5,6 +5,7 @@ from __future__ import annotations +from functools import cached_property from typing import Iterable, Iterator, Optional from unstructured.chunking.base import ( @@ -17,7 +18,6 @@ is_title, ) from unstructured.documents.elements import Element -from unstructured.utils import lazyproperty def chunk_by_title( @@ -124,7 +124,7 @@ class _ByTitleChunkingOptions(ChunkingOptions): appearing on two different pages can appear in the same chunk. """ - @lazyproperty + @cached_property def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: """The semantic-boundary detectors to be applied to break pre-chunks. @@ -140,7 +140,7 @@ def iter_boundary_predicates() -> Iterator[BoundaryPredicate]: return tuple(iter_boundary_predicates()) - @lazyproperty + @cached_property def combine_text_under_n_chars(self) -> int: """Combine consecutive text pre-chunks if former is smaller than this and both will fit. @@ -152,7 +152,7 @@ def combine_text_under_n_chars(self) -> int: arg_value = self._kwargs.get("combine_text_under_n_chars") return self.hard_max if arg_value is None else arg_value - @lazyproperty + @cached_property def multipage_sections(self) -> bool: """When False, break pre-chunks on page-boundaries.""" arg_value = self._kwargs.get("multipage_sections") diff --git a/unstructured/common/html_table.py b/unstructured/common/html_table.py index a441e5a57b..e489669b89 100644 --- a/unstructured/common/html_table.py +++ b/unstructured/common/html_table.py @@ -6,13 +6,12 @@ from __future__ import annotations import html +from functools import cached_property from typing import TYPE_CHECKING, Iterator, Sequence, cast from lxml import etree from lxml.html import fragment_fromstring -from unstructured.utils import lazyproperty - if TYPE_CHECKING: from lxml.html import HtmlElement @@ -90,7 +89,7 @@ def from_html_text(cls, html_text: str) -> HtmlTable: return cls(table) - @lazyproperty + @cached_property def html(self) -> str: """The HTML-fragment for this `` element, all on one line. @@ -105,7 +104,7 @@ def html(self) -> str: def iter_rows(self) -> Iterator[HtmlRow]: yield from (HtmlRow(tr) for tr in cast("list[HtmlElement]", self._table.xpath("./tr"))) - @lazyproperty + @cached_property def text(self) -> str: """The clean, concatenated, text for this table.""" table_text = " ".join(self._table.itertext()) @@ -119,7 +118,7 @@ class HtmlRow: def __init__(self, tr: HtmlElement): self._tr = tr - @lazyproperty + @cached_property def html(self) -> str: """Like "".""" return etree.tostring(self._tr, encoding=str) @@ -140,7 +139,7 @@ def iter_cell_texts(self) -> Iterator[str]: continue yield text - @lazyproperty + @cached_property def text_len(self) -> int: """Length of the normalized text, as it would appear in `element.text`.""" return len(" ".join(self.iter_cell_texts())) @@ -152,12 +151,12 @@ class HtmlCell: def __init__(self, td: HtmlElement): self._td = td - @lazyproperty + @cached_property def html(self) -> str: """Like "".""" return etree.tostring(self._td, encoding=str) if self.text else "
foobar
foo bar baz" - @lazyproperty + @cached_property def text(self) -> str: """Text inside `` element, empty string when no text.""" if (text := self._td.text) is None: diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 4ead6617f4..77da6b5eb6 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -9,6 +9,7 @@ import os import pathlib import uuid +from functools import cached_property from itertools import groupby from types import MappingProxyType from typing import Any, Callable, FrozenSet, Optional, Sequence, cast @@ -21,7 +22,7 @@ RelativeCoordinateSystem, ) from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA -from unstructured.utils import get_call_args_applying_defaults, lazyproperty +from unstructured.utils import get_call_args_applying_defaults Point: TypeAlias = "tuple[float, float]" Points: TypeAlias = "tuple[Point, ...]" @@ -444,7 +445,7 @@ def update(self, other: ElementMetadata) -> None: for field_name, field_value in other.fields.items(): setattr(self, field_name, field_value) - @lazyproperty + @cached_property def _known_field_names(self) -> FrozenSet[str]: """field-names for non-user-defined fields, available on all ElementMetadata instances. diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 2f3d4becf9..2b6c4bb380 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -37,6 +37,7 @@ import re import tempfile import zipfile +from functools import cached_property from typing import IO, Callable, Iterator, Optional import filetype as ft @@ -51,7 +52,7 @@ from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN from unstructured.partition.common.common import add_element_metadata, exactly_one from unstructured.partition.common.metadata import set_element_hierarchy -from unstructured.utils import get_call_args_applying_defaults, lazyproperty +from unstructured.utils import get_call_args_applying_defaults try: importlib.import_module("magic") @@ -283,7 +284,7 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None: return None - @lazyproperty + @cached_property def _file_type_from_file_extension(self) -> FileType | None: """Determine file-type from filename extension. @@ -348,10 +349,10 @@ def content_type(self) -> str | None: further verification. All lower-case when not `None`. """ # -- Note `._content_type` is mutable via `.invalidate_content_type()` so this cannot be a - # -- `@lazyproperty`. + # -- `@cached_property`. return self._content_type.lower() if self._content_type else None - @lazyproperty + @cached_property def encoding(self) -> str: """Character-set used to encode text of this file. @@ -359,7 +360,7 @@ def encoding(self) -> str: """ return format_encoding_str(self._encoding_arg or "utf-8") - @lazyproperty + @cached_property def extension(self) -> str: """Best filename-extension we can muster, "" when there is no available source.""" # -- get from file_path, or file when it has a name (path) -- @@ -374,13 +375,13 @@ def extension(self) -> str: # -- otherwise empty str means no extension, same as a path like "a/b/name-no-ext" -- return "" - @lazyproperty + @cached_property def file_head(self) -> bytes: """The initial bytes of the file to be recognized, for use with libmagic detection.""" with self.open() as file: return file.read(8192) - @lazyproperty + @cached_property def file_path(self) -> str | None: """Filesystem path to file to be inspected, when provided on call. @@ -392,7 +393,7 @@ def file_path(self) -> str | None: return os.path.realpath(file_path) if os.path.islink(file_path) else file_path - @lazyproperty + @cached_property def has_code_mime_type(self) -> bool: """True when `mime_type` plausibly indicates a programming language source-code file.""" mime_type = self.mime_type @@ -421,13 +422,13 @@ def has_code_mime_type(self) -> bool: ] ) - @lazyproperty + @cached_property def is_zipfile(self) -> bool: """True when file is a Zip archive.""" with self.open() as file: return zipfile.is_zipfile(file) - @lazyproperty + @cached_property def mime_type(self) -> str | None: """The best MIME-type we can get from `magic` (or `filetype` package). @@ -521,7 +522,7 @@ def rule_out_zip_content_types(self) -> None: ): self._content_type = None - @lazyproperty + @cached_property def text_head(self) -> str: """The initial characters of the text file for use with text-format differentiation. @@ -596,13 +597,13 @@ def _file_type(self) -> FileType | None: return None - @lazyproperty + @cached_property def _is_ole_file(self) -> bool: """True when file has CFB magic first 8 bytes.""" with self._ctx.open() as file: return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" - @lazyproperty + @cached_property def _ole_file_type(self) -> FileType | None: with self._ctx.open() as f: ole = OleFileIO(f) # pyright: ignore[reportUnknownVariableType] @@ -637,7 +638,7 @@ def applies(cls, ctx: _FileTypeDetectionContext) -> _TextFileDifferentiator | No else None ) - @lazyproperty + @cached_property def file_type(self) -> FileType: """Differentiated file-type for textual content. @@ -683,7 +684,7 @@ def file_type(self) -> FileType: return FileType.TXT - @lazyproperty + @cached_property def _is_csv(self) -> bool: """True when file is plausibly in Comma Separated Values (CSV) format.""" @@ -704,7 +705,7 @@ def count_commas(text: str): header_count = count_commas(lines[0]) return all(count_commas(line) == header_count for line in lines[1:]) - @lazyproperty + @cached_property def _is_eml(self) -> bool: """Checks if a text/plain file is actually a .eml file. @@ -713,7 +714,7 @@ def _is_eml(self) -> bool: """ return EMAIL_HEAD_RE.match(self._ctx.text_head) is not None - @lazyproperty + @cached_property def _is_json(self) -> bool: """True when file is JSON collection. @@ -754,7 +755,7 @@ def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None: """ return cls(ctx)._file_type - @lazyproperty + @cached_property def _file_type(self) -> FileType | None: """Differentiated file-type for a Zip archive. diff --git a/unstructured/partition/csv.py b/unstructured/partition/csv.py index 24e2d03058..87f74ac168 100644 --- a/unstructured/partition/csv.py +++ b/unstructured/partition/csv.py @@ -2,6 +2,7 @@ import contextlib import csv +from functools import cached_property from typing import IO, Any, Iterator import pandas as pd @@ -11,7 +12,7 @@ from unstructured.documents.elements import Element, ElementMetadata, Table from unstructured.file_utils.model import FileType from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date -from unstructured.utils import is_temp_file_path, lazyproperty +from unstructured.utils import is_temp_file_path DETECTION_ORIGIN: str = "csv" CSV_FIELD_LIMIT = 10 * 1048576 # 10MiB @@ -111,7 +112,7 @@ def load( infer_table_structure=infer_table_structure, )._validate() - @lazyproperty + @cached_property def delimiter(self) -> str | None: """The CSV delimiter, nominally a comma ",". @@ -132,17 +133,17 @@ def delimiter(self) -> str | None: # -- sniffing will fail on single-column csv as no default can be assumed -- return None - @lazyproperty + @cached_property def header(self) -> int | None: """Identifies the header row, if any, to Pandas, by idx.""" return 0 if self._include_header else None - @lazyproperty + @cached_property def encoding(self) -> str | None: """The encoding to use for reading the file.""" return self._encoding - @lazyproperty + @cached_property def last_modified(self) -> str | None: """The best last-modified date available, None if no sources are available.""" return ( diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 8491555e01..1347d88134 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -8,6 +8,7 @@ import os import tempfile import zipfile +from functools import cached_property from typing import IO, Any, Iterator, Protocol, Type import docx @@ -52,7 +53,7 @@ is_us_city_state_zip, ) from unstructured.partition.utils.constants import PartitionStrategy -from unstructured.utils import is_temp_file_path, lazyproperty +from unstructured.utils import is_temp_file_path STYLE_TO_ELEMENT_MAPPING = { "Caption": Text, # TODO(robinson) - add caption element type @@ -223,12 +224,12 @@ def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT): """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs.""" cls._PicturePartitionerCls = picture_partitioner - @lazyproperty + @cached_property def document(self) -> Document: """The python-docx `Document` object loaded from file or filename.""" return docx.Document(self._docx_file) - @lazyproperty + @cached_property def include_page_breaks(self) -> bool: """When True, include `PageBreak` elements in element-stream. @@ -245,12 +246,12 @@ def increment_page_number(self) -> Iterator[PageBreak]: if self._include_page_breaks: yield PageBreak("", detection_origin=DETECTION_ORIGIN) - @lazyproperty + @cached_property def infer_table_structure(self) -> bool: """True when partitioner should compute and apply `text_as_html` metadata for tables.""" return self._infer_table_structure - @lazyproperty + @cached_property def last_modified(self) -> str | None: """The best last-modified date available, None if no sources are available.""" if not self._file_path: @@ -260,7 +261,7 @@ def last_modified(self) -> str | None: None if is_temp_file_path(self._file_path) else get_last_modified_date(self._file_path) ) - @lazyproperty + @cached_property def metadata_file_path(self) -> str | None: """The best available file-path for this document or `None` if unavailable.""" return self._file_path @@ -290,7 +291,7 @@ def page_number(self) -> int: """ return self._page_counter - @lazyproperty + @cached_property def picture_partitioner(self) -> PicturePartitionerT: """The sub-partitioner to use for DOCX image extraction.""" # -- Note this value has partitioning-run scope. An instance of this options class is @@ -300,7 +301,7 @@ def picture_partitioner(self) -> PicturePartitionerT: # -- ensures image extraction is processed consistently within a single document. return self._PicturePartitionerCls or _NullPicturePartitioner - @lazyproperty + @cached_property def strategy(self) -> str: """The partitioning strategy for this document. @@ -309,7 +310,7 @@ def strategy(self) -> str: """ return PartitionStrategy.HI_RES if self._strategy is None else self._strategy - @lazyproperty + @cached_property def _document_contains_pagebreaks(self) -> bool: """True when there is at least one page-break detected in the document. @@ -330,7 +331,7 @@ def _document_contains_pagebreaks(self) -> bool: return bool(self.document.element.xpath(xpath)) - @lazyproperty + @cached_property def _docx_file(self) -> str | IO[bytes]: """The Word 2007+ document file to be partitioned. @@ -540,12 +541,12 @@ def iter_row_cells_as_text(row: _Row) -> Iterator[str]: return htmlify_matrix_of_cell_texts([list(iter_row_cells_as_text(r)) for r in table.rows]) - @lazyproperty + @cached_property def _document(self) -> Document: """The python-docx `Document` object loaded from file or filename.""" return self._opts.document - @lazyproperty + @cached_property def _document_contains_sections(self) -> bool: """True when there is at least one section in the document. diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py index 30f0147357..2cbd7c9f59 100644 --- a/unstructured/partition/email.py +++ b/unstructured/partition/email.py @@ -12,6 +12,7 @@ import io import os from email.message import EmailMessage, MIMEPart +from functools import cached_property from typing import IO, Any, Final, Iterator, cast from dateutil import parser @@ -23,7 +24,6 @@ from unstructured.partition.common.metadata import get_last_modified_date from unstructured.partition.html import partition_html from unstructured.partition.text import partition_text -from unstructured.utils import lazyproperty VALID_CONTENT_SOURCES: Final[tuple[str, ...]] = ("text/html", "text/plain") @@ -119,7 +119,7 @@ def load( kwargs=kwargs, )._validate() - @lazyproperty + @cached_property def bcc_addresses(self) -> list[str] | None: """The "blind carbon-copy" Bcc: addresses of the message.""" bccs = self.msg.get_all("Bcc") @@ -128,7 +128,7 @@ def bcc_addresses(self) -> list[str] | None: addrs = email.utils.getaddresses(bccs) return [email.utils.formataddr(addr) for addr in addrs] - @lazyproperty + @cached_property def body_part(self) -> MIMEPart | None: """The message part containing the actual textual email message. @@ -137,7 +137,7 @@ def body_part(self) -> MIMEPart | None: """ return self.msg.get_body(preferencelist=self.content_type_preference) - @lazyproperty + @cached_property def cc_addresses(self) -> list[str] | None: """The "carbon-copy" Cc: addresses of the message.""" ccs = self.msg.get_all("Cc") @@ -146,7 +146,7 @@ def cc_addresses(self) -> list[str] | None: addrs = email.utils.getaddresses(ccs) return [email.utils.formataddr(addr) for addr in addrs] - @lazyproperty + @cached_property def content_type_preference(self) -> tuple[str, ...]: """Whether to prefer HTML or plain-text body when message-body has both. @@ -155,7 +155,7 @@ def content_type_preference(self) -> tuple[str, ...]: """ return ("plain", "html") if self._content_source == "text/plain" else ("html", "plain") - @lazyproperty + @cached_property def email_metadata(self) -> ElementMetadata: """The email-specific metadata fields for this message. @@ -171,7 +171,7 @@ def email_metadata(self) -> ElementMetadata: subject=self.subject, ) - @lazyproperty + @cached_property def from_address(self) -> str | None: """The address of the message sender.""" froms = self.msg.get_all("From") @@ -182,7 +182,7 @@ def from_address(self) -> str | None: formatted_addrs = [email.utils.formataddr(addr) for addr in addrs] return formatted_addrs[0] - @lazyproperty + @cached_property def message_id(self) -> str | None: """The value of the Message-ID: header, when present.""" raw_id = self.msg.get("Message-ID") @@ -190,7 +190,7 @@ def message_id(self) -> str | None: return None return raw_id.strip().strip("<>") - @lazyproperty + @cached_property def metadata_file_path(self) -> str | None: """The best available file-path information for this email message. @@ -205,7 +205,7 @@ def metadata_file_path(self) -> str | None: """ return self._metadata_file_path or self._file_path or None - @lazyproperty + @cached_property def metadata_last_modified(self) -> str | None: """The best available last-modified date for this message, as an ISO8601 string. @@ -221,7 +221,7 @@ def metadata_last_modified(self) -> str | None: """ return self._metadata_last_modified or self._sent_date or self._filesystem_last_modified - @lazyproperty + @cached_property def msg(self) -> EmailMessage: """The Python stdlib `email.message.EmailMessage` object parsed from the EML file.""" if self._file_path is not None: @@ -236,7 +236,7 @@ def msg(self) -> EmailMessage: return cast(EmailMessage, email.message_from_bytes(file_bytes, policy=email.policy.default)) - @lazyproperty + @cached_property def partitioning_kwargs(self) -> dict[str, Any]: """The "extra" keyword arguments received by `partition_email()`. @@ -245,7 +245,7 @@ def partitioning_kwargs(self) -> dict[str, Any]: """ return self._kwargs - @lazyproperty + @cached_property def process_attachments(self) -> bool: """When True, partition attachments in addition to the email message body. @@ -254,7 +254,7 @@ def process_attachments(self) -> bool: """ return self._process_attachments - @lazyproperty + @cached_property def subject(self) -> str | None: """The value of the Subject: header, when present.""" subject = self.msg.get("Subject") @@ -262,7 +262,7 @@ def subject(self) -> str | None: return None return subject - @lazyproperty + @cached_property def to_addresses(self) -> list[str] | None: """The To: addresses of the message.""" tos = self.msg.get_all("To") @@ -271,12 +271,12 @@ def to_addresses(self) -> list[str] | None: addrs = email.utils.getaddresses(tos) return [email.utils.formataddr(addr) for addr in addrs] - @lazyproperty + @cached_property def _filesystem_last_modified(self) -> str | None: """Last-modified retrieved from filesystem when a file-path was provided, None otherwise.""" return get_last_modified_date(self._file_path) if self._file_path else None - @lazyproperty + @cached_property def _sent_date(self) -> str | None: """ISO-8601 str representation of message sent-date, if available.""" date_str = self.msg.get("Date") @@ -416,7 +416,7 @@ def _iter_elements(self) -> Iterator[Element]: e.metadata.attached_to_filename = self._attached_to_filename yield e - @lazyproperty + @cached_property def _attached_to_filename(self) -> str | None: """The file-name (no path) of the message. `None` if not available.""" file_path = self._ctx.metadata_file_path @@ -424,12 +424,12 @@ def _attached_to_filename(self) -> str | None: return None return os.path.basename(file_path) - @lazyproperty + @cached_property def _attachment_file_name(self) -> str | None: """The original name of the attached file, `None` if not present in the MIME part.""" return self._attachment.get_filename() - @lazyproperty + @cached_property def _file_bytes(self) -> bytes: """The bytes of the attached file.""" content = self._attachment.get_content() diff --git a/unstructured/partition/html/parser.py b/unstructured/partition/html/parser.py index f3df2b8a15..5f4d09d401 100644 --- a/unstructured/partition/html/parser.py +++ b/unstructured/partition/html/parser.py @@ -77,6 +77,7 @@ import re from collections import defaultdict, deque +from functools import cached_property from types import MappingProxyType from typing import Any, Iterable, Iterator, Mapping, NamedTuple, Sequence, cast @@ -104,7 +105,6 @@ is_possible_narrative_text, is_us_city_state_zip, ) -from unstructured.utils import lazyproperty # ------------------------------------------------------------------------------------------------ # DOMAIN MODEL @@ -362,7 +362,7 @@ def iter_elements(self) -> Iterator[Element]: yield from block_item.iter_elements() yield from self._element_from_text_or_tail(block_item.tail or "", q) - @lazyproperty + @cached_property def _element_accum(self) -> _ElementAccumulator: """Text-segment accumulator suitable for this block-element.""" return _ElementAccumulator(self) @@ -477,7 +477,7 @@ class Pre(BlockItem): _ElementCls = CodeSnippet - @lazyproperty + @cached_property def _element_accum(self) -> _ElementAccumulator: """Text-segment accumulator suitable for this block-element.""" return _PreElementAccumulator(self) diff --git a/unstructured/partition/html/partition.py b/unstructured/partition/html/partition.py index bf29d61781..e40a6c30a4 100644 --- a/unstructured/partition/html/partition.py +++ b/unstructured/partition/html/partition.py @@ -4,6 +4,7 @@ from __future__ import annotations +from functools import cached_property from typing import IO, Any, Callable, Iterator, List, Literal, Optional, cast import requests @@ -19,7 +20,7 @@ ontology_to_unstructured_elements, parse_html_to_ontology, ) -from unstructured.utils import is_temp_file_path, lazyproperty +from unstructured.utils import is_temp_file_path @apply_metadata(FileType.HTML) @@ -139,12 +140,12 @@ def __init__( self._extract_image_block_types = extract_image_block_types self._extract_image_block_to_payload = extract_image_block_to_payload - @lazyproperty + @cached_property def detection_origin(self) -> str | None: """Trace of initial partitioner to be included in metadata for debugging purposes.""" return self._detection_origin - @lazyproperty + @cached_property def html_text(self) -> str: """The HTML document as a string, loaded from wherever the caller specified.""" if self._file_path: @@ -170,7 +171,7 @@ def html_text(self) -> str: raise ValueError("Exactly one of filename, file, text, or url must be specified.") - @lazyproperty + @cached_property def last_modified(self) -> str | None: """The best last-modified date available, None if no sources are available.""" return ( @@ -179,17 +180,17 @@ def last_modified(self) -> str | None: else get_last_modified_date(self._file_path) ) - @lazyproperty + @cached_property def skip_headers_and_footers(self) -> bool: """When True, elements located within a header or footer are pruned.""" return self._skip_headers_and_footers - @lazyproperty + @cached_property def html_parser_version(self) -> Literal["v1", "v2"]: """When html_parser_version=='v2', HTML elements follow ontology schema.""" return self._html_parser_version - @lazyproperty + @cached_property def add_img_alt_text(self) -> bool: """When True, the alternative text of images is included in the output.""" return self._image_alt_mode == "to_text" @@ -241,7 +242,7 @@ def _iter_elements(self) -> Iterator[Element]: e.metadata.image_mime_type = None yield e - @lazyproperty + @cached_property def _main(self) -> Flow: """The root HTML element.""" # NOTE(scanny) - get `html_text` first so any encoding error raised is not confused with a @@ -275,7 +276,7 @@ def _main(self) -> Flow: return cast(Flow, body) return cast(Flow, root) - @lazyproperty + @cached_property def _from_ontology(self) -> List[Element]: """Convert an ontology elements represented in HTML to an ontology element.""" html_text = self._opts.html_text diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index 1fe5e9a5b6..0894b0ba18 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -3,6 +3,7 @@ import os import re import tempfile +from functools import cached_property from typing import IO, Any, Iterator, Optional from oxmsg import Message @@ -15,7 +16,7 @@ from unstructured.partition.common.metadata import get_last_modified_date from unstructured.partition.html import partition_html from unstructured.partition.text import partition_text -from unstructured.utils import is_temp_file_path, lazyproperty +from unstructured.utils import is_temp_file_path def partition_msg( @@ -75,7 +76,7 @@ def __init__( self._partition_attachments = partition_attachments self._kwargs = kwargs - @lazyproperty + @cached_property def extra_msg_metadata(self) -> ElementMetadata: """ElementMetadata suitable for use on an element formed from message content. @@ -109,7 +110,7 @@ def extra_msg_metadata(self) -> ElementMetadata: return element_metadata - @lazyproperty + @cached_property def is_encrypted(self) -> bool: """True when message is encrypted.""" # NOTE(robinson) - Per RFC 2015, the content type for emails with PGP encrypted content @@ -121,7 +122,7 @@ def is_encrypted(self) -> bool: # encryption. return "encrypted" in self.msg.message_headers.get("Content-Type", "") - @lazyproperty + @cached_property def metadata_file_path(self) -> str | None: """Best available path for MSG file. @@ -130,23 +131,23 @@ def metadata_file_path(self) -> str | None: """ return self._metadata_file_path or self._file_path - @lazyproperty + @cached_property def metadata_last_modified(self) -> str | None: """Caller override for `.metadata.last_modified` to be applied to all elements.""" email_date = sent_date.isoformat() if (sent_date := self.msg.sent_date) else None return self._metadata_last_modified or email_date or self._last_modified - @lazyproperty + @cached_property def msg(self) -> Message: """The `oxmsg.Message` object loaded from file or filename.""" return Message.load(self._msg_file) - @lazyproperty + @cached_property def partition_attachments(self) -> bool: """True when message attachments should also be partitioned.""" return self._partition_attachments - @lazyproperty + @cached_property def partitioning_kwargs(self) -> dict[str, Any]: """The "extra" keyword arguments received by `partition_msg()`. @@ -155,7 +156,7 @@ def partitioning_kwargs(self) -> dict[str, Any]: """ return self._kwargs - @lazyproperty + @cached_property def _last_modified(self) -> str | None: """The best last-modified date available from source-file, None if not available.""" if not self._file_path or is_temp_file_path(self._file_path): @@ -163,7 +164,7 @@ def _last_modified(self) -> str | None: return get_last_modified_date(self._file_path) - @lazyproperty + @cached_property def _msg_file(self) -> str | IO[bytes]: """The source for the bytes of the message, either a file-path or a file-like object.""" if file_path := self._file_path: @@ -200,7 +201,7 @@ def _iter_message_elements(self) -> Iterator[Element]: for attachment in self._attachments: yield from _AttachmentPartitioner.iter_elements(attachment, self._opts) - @lazyproperty + @cached_property def _attachments(self) -> tuple[Attachment, ...]: """The `oxmsg.attachment.Attachment` objects for this message.""" return tuple(self._opts.msg.attachments) @@ -281,7 +282,7 @@ def _iter_elements(self) -> Iterator[Element]: e.metadata.attached_to_filename = self._opts.metadata_file_path yield e - @lazyproperty + @cached_property def _attachment_file_name(self) -> str: """The original name of the attached file, no path. @@ -304,7 +305,7 @@ def _attachment_file_name(self) -> str: return safe_filename - @lazyproperty + @cached_property def _attachment_last_modified(self) -> str | None: """ISO8601 string timestamp of attachment last-modified date. @@ -316,7 +317,7 @@ def _attachment_last_modified(self) -> str | None: return last_modified.isoformat() return self._opts.metadata_last_modified - @lazyproperty + @cached_property def _file_bytes(self) -> bytes: """The bytes of the attached file.""" return self._attachment.file_bytes or b"" diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py index 0fc46c773e..8652234255 100644 --- a/unstructured/partition/pptx.py +++ b/unstructured/partition/pptx.py @@ -7,6 +7,7 @@ from __future__ import annotations import io +from functools import cached_property from tempfile import SpooledTemporaryFile from typing import IO, Any, Iterator, Protocol, Sequence @@ -42,7 +43,7 @@ is_possible_title, ) from unstructured.partition.utils.constants import PartitionStrategy -from unstructured.utils import is_temp_file_path, lazyproperty +from unstructured.utils import is_temp_file_path DETECTION_ORIGIN = "pptx" @@ -314,7 +315,7 @@ def sort_key(shape: BaseShape) -> tuple[int, int]: return slide.shapes.title, sorted(iter_shapes(slide.shapes), key=sort_key) - @lazyproperty + @cached_property def _presentation(self) -> Presentation: """The python-pptx `Presentation` object loaded from the provided source file.""" return pptx.Presentation(self._opts.pptx_file) @@ -363,7 +364,7 @@ def register_picture_partitioner(cls, picture_partitioner: AbstractPicturePartit """Specify a pluggable sub-partitioner to be used for partitioning PPTX images.""" cls._PicturePartitionerCls = picture_partitioner - @lazyproperty + @cached_property def include_page_breaks(self) -> bool: """When True, include `PageBreak` elements in element-stream. @@ -373,7 +374,7 @@ def include_page_breaks(self) -> bool: """ return self._include_page_breaks - @lazyproperty + @cached_property def include_slide_notes(self) -> bool: """When True, also partition any text found in slide notes as part of each slide.""" return False if self._include_slide_notes is None else self._include_slide_notes @@ -394,12 +395,12 @@ def increment_page_number(self) -> Iterator[PageBreak]: ), ) - @lazyproperty + @cached_property def infer_table_structure(self) -> bool: """True when partitioner should compute and apply `text_as_html` metadata for tables.""" return self._infer_table_structure - @lazyproperty + @cached_property def last_modified(self) -> str | None: """The best last-modified date available, None if no sources are available.""" if not self._file_path: @@ -409,7 +410,7 @@ def last_modified(self) -> str | None: None if is_temp_file_path(self._file_path) else get_last_modified_date(self._file_path) ) - @lazyproperty + @cached_property def metadata_file_path(self) -> str | None: """The best available file-path for this document or `None` if unavailable.""" return self._file_path @@ -419,7 +420,7 @@ def page_number(self) -> int: """The current page (slide) number.""" return self._page_counter - @lazyproperty + @cached_property def picture_partitioner(self) -> AbstractPicturePartitioner: """The sub-partitioner to use for PPTX Picture shapes.""" # -- Note this value has partitioning-run scope. An instance of this options class is @@ -436,7 +437,7 @@ def picture_partitioner(self) -> AbstractPicturePartitioner: else self._PicturePartitionerCls ) - @lazyproperty + @cached_property def pptx_file(self) -> str | IO[bytes]: """The PowerPoint document file to be partitioned. @@ -460,7 +461,7 @@ def pptx_file(self) -> str | IO[bytes]: "No PPTX document specified, either `filename` or `file` argument must be provided" ) - @lazyproperty + @cached_property def strategy(self) -> str: """The requested partitioning strategy. diff --git a/unstructured/partition/xlsx.py b/unstructured/partition/xlsx.py index de7ae6c094..ffffdac4cb 100644 --- a/unstructured/partition/xlsx.py +++ b/unstructured/partition/xlsx.py @@ -3,6 +3,7 @@ from __future__ import annotations import io +from functools import cached_property from typing import IO, Any, Iterator, Optional import networkx as nx @@ -33,7 +34,6 @@ is_possible_numbered_list, is_possible_title, ) -from unstructured.utils import lazyproperty _CellCoordinate: TypeAlias = "tuple[int, int]" @@ -153,7 +153,7 @@ def __init__( self._include_header = include_header self._infer_table_structure = infer_table_structure - @lazyproperty + @cached_property def find_subtable(self) -> bool: """True when partitioner should detect and emit separate `Table` elements for subtables. @@ -161,32 +161,32 @@ def find_subtable(self) -> bool: """ return self._find_subtable - @lazyproperty + @cached_property def header_row_idx(self) -> int | None: """The index of the row Pandas should treat as column-headings. Either 0 or None.""" return 0 if self._include_header else None - @lazyproperty + @cached_property def include_header(self) -> bool: """True when column headers should be included in tables.""" return self._include_header - @lazyproperty + @cached_property def infer_table_structure(self) -> bool: """True when partitioner should compute and apply `text_as_html` metadata.""" return self._infer_table_structure - @lazyproperty + @cached_property def last_modified(self) -> Optional[str]: """The best last-modified date available, None if no sources are available.""" return get_last_modified_date(self._file_path) if self._file_path else None - @lazyproperty + @cached_property def metadata_file_path(self) -> str | None: """The best available file-path for this document or `None` if unavailable.""" return self._file_path - @lazyproperty + @cached_property def sheets(self) -> dict[str, pd.DataFrame]: """The spreadsheet worksheets, each as a data-frame mapped by sheet-name.""" try: @@ -201,7 +201,7 @@ def sheets(self) -> dict[str, pd.DataFrame]: io.BytesIO(self._file_bytes), sheet_name=None, header=self.header_row_idx ) - @lazyproperty + @cached_property def _file_bytes(self) -> bytes: if file := self._file: file.seek(0) @@ -224,7 +224,7 @@ def __init__(self, worksheet: pd.DataFrame, cell_coordinate_set: set[_CellCoordi self._worksheet = worksheet self._cell_coordinate_set = cell_coordinate_set - @lazyproperty + @cached_property def max_x(self) -> int: """The right-most column index of the connected component.""" return self._extents[2] @@ -239,12 +239,12 @@ def merge(self, other: _ConnectedComponent) -> _ConnectedComponent: self._worksheet, self._cell_coordinate_set.union(other._cell_coordinate_set) ) - @lazyproperty + @cached_property def min_x(self) -> int: """The left-most column index of the connected component.""" return self._extents[0] - @lazyproperty + @cached_property def subtable(self) -> pd.DataFrame: """The connected region of the worksheet as a `DataFrame`. @@ -254,7 +254,7 @@ def subtable(self) -> pd.DataFrame: min_x, min_y, max_x, max_y = self._extents return self._worksheet.iloc[min_x : max_x + 1, min_y : max_y + 1] - @lazyproperty + @cached_property def _extents(self) -> tuple[int, int, int, int]: """Compute bounding box of this connected component.""" min_x, min_y, max_x, max_y = float("inf"), float("inf"), float("-inf"), float("-inf") @@ -288,7 +288,7 @@ def from_worksheet_df(cls, worksheet_df: pd.DataFrame) -> Self: """Construct from a worksheet dataframe produced by reading Excel with pandas.""" return cls(worksheet_df) - @lazyproperty + @cached_property def _connected_components(self) -> list[_ConnectedComponent]: """The `_ConnectedComponent` objects comprising this collection.""" # -- produce a 2D-graph representing the populated cells of the worksheet (or subsheet). @@ -367,7 +367,7 @@ class _SubtableParser: def __init__(self, subtable: pd.DataFrame): self._subtable = subtable - @lazyproperty + @cached_property def core_table(self) -> pd.DataFrame | None: """The part between the leading and trailing single-cell rows, if any.""" core_table_start = len(self._leading_single_cell_row_indices) @@ -395,7 +395,7 @@ def iter_trailing_single_cell_rows_texts(self) -> Iterator[str]: for row_idx in self._trailing_single_cell_row_indices: yield self._subtable.iloc[row_idx].dropna().iloc[0] # pyright: ignore - @lazyproperty + @cached_property def _leading_single_cell_row_indices(self) -> tuple[int, ...]: """Index of each leading single-cell row in subtable, in top-down order.""" @@ -407,7 +407,7 @@ def iter_leading_single_cell_row_indices() -> Iterator[int]: return tuple(iter_leading_single_cell_row_indices()) - @lazyproperty + @cached_property def _single_cell_row_indices(self) -> tuple[int, ...]: """Index of each single-cell row in subtable, in top-down order.""" @@ -419,7 +419,7 @@ def iter_single_cell_row_idxs() -> Iterator[int]: return tuple(iter_single_cell_row_idxs()) - @lazyproperty + @cached_property def _trailing_single_cell_row_indices(self) -> tuple[int, ...]: """Index of each trailing single-cell row in subtable, in top-down order.""" # -- if all subtable rows are single-cell, then by convention they are all leading -- diff --git a/unstructured/utils.py b/unstructured/utils.py index 7674cab6fe..578135502a 100644 --- a/unstructured/utils.py +++ b/unstructured/utils.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import functools import importlib import inspect import json @@ -16,7 +15,6 @@ TYPE_CHECKING, Any, Callable, - Generic, Iterable, Iterator, List, @@ -68,115 +66,6 @@ def is_temp_file_path(file_path: str) -> bool: return file_path.startswith(tempfile.gettempdir()) -class lazyproperty(Generic[_T]): - """Decorator like @property, but evaluated only on first access. - - Like @property, this can only be used to decorate methods having only a `self` parameter, and - is accessed like an attribute on an instance, i.e. trailing parentheses are not used. Unlike - @property, the decorated method is only evaluated on first access; the resulting value is - cached and that same value returned on second and later access without re-evaluation of the - method. - - Like @property, this class produces a *data descriptor* object, which is stored in the __dict__ - of the *class* under the name of the decorated method ('fget' nominally). The cached value is - stored in the __dict__ of the *instance* under that same name. - - Because it is a data descriptor (as opposed to a *non-data descriptor*), its `__get__()` method - is executed on each access of the decorated attribute; the __dict__ item of the same name is - "shadowed" by the descriptor. - - While this may represent a performance improvement over a property, its greater benefit may be - its other characteristics. One common use is to construct collaborator objects, removing that - "real work" from the constructor, while still only executing once. It also de-couples client - code from any sequencing considerations; if it's accessed from more than one location, it's - assured it will be ready whenever needed. - - Loosely based on: https://stackoverflow.com/a/6849299/1902513. - - A lazyproperty is read-only. There is no counterpart to the optional "setter" (or deleter) - behavior of an @property. This is critically important to maintaining its immutability and - idempotence guarantees. Attempting to assign to a lazyproperty raises AttributeError - unconditionally. - - The parameter names in the methods below correspond to this usage example:: - - class Obj(object) - - @lazyproperty - def fget(self): - return 'some result' - - obj = Obj() - - Not suitable for wrapping a function (as opposed to a method) because it is not callable. - """ - - def __init__(self, fget: Callable[..., _T]) -> None: - """*fget* is the decorated method (a "getter" function). - - A lazyproperty is read-only, so there is only an *fget* function (a regular - @property can also have an fset and fdel function). This name was chosen for - consistency with Python's `property` class which uses this name for the - corresponding parameter. - """ - # --- maintain a reference to the wrapped getter method - self._fget = fget - # --- and store the name of that decorated method - self._name = fget.__name__ - # --- adopt fget's __name__, __doc__, and other attributes - functools.update_wrapper(self, fget) # pyright: ignore - - def __get__(self, obj: Any, type: Any = None) -> _T: - """Called on each access of 'fget' attribute on class or instance. - - *self* is this instance of a lazyproperty descriptor "wrapping" the property - method it decorates (`fget`, nominally). - - *obj* is the "host" object instance when the attribute is accessed from an - object instance, e.g. `obj = Obj(); obj.fget`. *obj* is None when accessed on - the class, e.g. `Obj.fget`. - - *type* is the class hosting the decorated getter method (`fget`) on both class - and instance attribute access. - """ - # --- when accessed on class, e.g. Obj.fget, just return this descriptor - # --- instance (patched above to look like fget). - if obj is None: - return self # type: ignore - - # --- when accessed on instance, start by checking instance __dict__ for - # --- item with key matching the wrapped function's name - value = obj.__dict__.get(self._name) - if value is None: - # --- on first access, the __dict__ item will be absent. Evaluate fget() - # --- and store that value in the (otherwise unused) host-object - # --- __dict__ value of same name ('fget' nominally) - value = self._fget(obj) - obj.__dict__[self._name] = value - return cast(_T, value) - - def __set__(self, obj: Any, value: Any) -> None: - """Raises unconditionally, to preserve read-only behavior. - - This decorator is intended to implement immutable (and idempotent) object - attributes. For that reason, assignment to this property must be explicitly - prevented. - - If this __set__ method was not present, this descriptor would become a - *non-data descriptor*. That would be nice because the cached value would be - accessed directly once set (__dict__ attrs have precedence over non-data - descriptors on instance attribute lookup). The problem is, there would be - nothing to stop assignment to the cached value, which would overwrite the result - of `fget()` and break both the immutability and idempotence guarantees of this - decorator. - - The performance with this __set__() method in place was roughly 0.4 usec per - access when measured on a 2.8GHz development machine; so quite snappy and - probably not a rich target for optimization efforts. - """ - raise AttributeError("can't set attribute") - - def save_as_jsonl(data: list[dict[str, Any]], filename: str) -> None: with open(filename, "w+") as output_file: output_file.writelines(json.dumps(datum) + "\n" for datum in data)