From 78a296c06009499127afc04ddb00f1b62ea7e08c Mon Sep 17 00:00:00 2001 From: jansaldo Date: Tue, 17 Mar 2026 17:19:39 +0000 Subject: [PATCH 01/28] =?UTF-8?q?=E2=9C=A8=20feat(extractors):=20use=20pym?= =?UTF-8?q?updf=20layout=20for=20pdf=20text=20extraction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/extractors/pdf.py | 20 +---- aymurai/text/extractors/utils.py | 150 ++++++++++++++----------------- 2 files changed, 67 insertions(+), 103 deletions(-) diff --git a/aymurai/text/extractors/pdf.py b/aymurai/text/extractors/pdf.py index 0e83c30..c672dfe 100644 --- a/aymurai/text/extractors/pdf.py +++ b/aymurai/text/extractors/pdf.py @@ -9,27 +9,11 @@ class PdfExtractor(BaseExtractor): extension = "pdf" - def extract(self, path: Path, y_tolerance: float | None = None, **_: Any) -> str: - """ - Extract normalized text from a PDF document. - - Args: - path (Path): Input document path. - y_tolerance (float | None, optional): Maximum vertical gap used to - merge nearby text blocks. If None, it is estimated from the - document. Defaults to None. - **_ (Any): Ignored extra keyword arguments for backward compatibility. - - Returns: - str: Cleaned textual content. - - Raises: - InvalidFile: If the file is unreadable or extraction fails. 
- """ + def extract(self, path: Path, **_: Any) -> str: file_path = self.ensure_file(path) try: - return pdf_to_text(file_path, y_tolerance=y_tolerance) + return pdf_to_text(file_path) except (OSError, ValueError) as exc: raise InvalidFile(str(exc)) from exc except Exception as exc: diff --git a/aymurai/text/extractors/utils.py b/aymurai/text/extractors/utils.py index 009b562..a091440 100644 --- a/aymurai/text/extractors/utils.py +++ b/aymurai/text/extractors/utils.py @@ -1,12 +1,13 @@ -import statistics +import re import unicodedata import xml.etree.ElementTree as ET import zipfile from pathlib import Path -from typing import Any +from typing import AbstractSet, Any -import numpy as np import pymupdf +import pymupdf.layout # noqa: F401 # activates layout support +import pymupdf4llm import xmltodict from lxml import etree from more_itertools import flatten @@ -18,6 +19,7 @@ ODT_NS = {"text": "urn:oasis:names:tc:opendocument:xmlns:text:1.0"} +PDF_SKIP_BOX_CLASSES = frozenset({"picture", "formula", "table"}) def normalize_text(text: str) -> str: @@ -33,109 +35,87 @@ def normalize_text(text: str) -> str: return unicodedata.normalize("NFKC", text) -def _compute_median_margin_between_blocks(pdf_path: str) -> float: +def _clean_pdf_box_text(text: str, box_class: str) -> str: """ - Computes the median vertical margin between text blocks in a PDF. + Clean box-level PDF text while preserving the original layout content. + Args: - pdf_path (str): Path to the PDF file. + text (str): Raw text sliced from a page box. + box_class (str): Box class emitted by ``pymupdf4llm``. + Returns: - float: Median margin between text blocks (in points). + str: Cleaned, normalized box text. 
""" - margins = [] - - with pymupdf.open(pdf_path) as doc: - for page in doc: - # Extract all text blocks from the page - blocks = page.get_text("blocks") - - # Sort blocks by their top y-coordinate (y0) - blocks_sorted = sorted(blocks, key=lambda b: b[1]) - - # Compute vertical margins between consecutive blocks - for i in range(1, len(blocks_sorted)): - previous_block = blocks_sorted[i - 1] - current_block = blocks_sorted[i] - - # Calculate the vertical margin - previous_y1 = previous_block[3] # Bottom of the previous block - current_y0 = current_block[1] # Top of the current block - margin = current_y0 - previous_y1 + text = normalize_text(text).strip() + if box_class == "footnote": + text = re.sub(r"(?m)^>\s?", "", text) + return text - if margin > 0: # Ignore overlapping blocks - margins.append(margin) - # Compute and return the median margin - if margins: - return statistics.median(margins) - else: - return 0.0 # Return 0 if no margins were found - - -def _extract_and_merge_paragraphs(pdf_path: str, y_tolerance: float = 5) -> list[str]: +def pdf_to_paragraphs( + file_path: Path | str, + *, + include_headers: bool = True, + include_footers: bool = True, + skip_box_classes: AbstractSet[str] = PDF_SKIP_BOX_CLASSES, +) -> list[str]: """ - Extracts and merges paragraphs from a PDF by grouping close text blocks. + Extract paragraph-like layout units from a PDF using PyMuPDF layout parsing. + Args: - pdf_path (str): Path to the PDF file. - y_tolerance (float, optional): Maximum vertical gap (in points) to consider blocks part of the same paragraph. - Defaults to 5. + file_path (Path | str): Path to the PDF document. + include_headers (bool): Whether to keep header boxes. + include_footers (bool): Whether to keep footer boxes. + skip_box_classes (AbstractSet[str]): Layout box classes to ignore. + Returns: - list[str]: A list of merged paragraphs as strings. + list[str]: Normalized paragraph strings extracted from the PDF. 
""" - paragraphs = [] - current_paragraph = [] - last_y1 = None - - with pymupdf.open(pdf_path) as doc: - for page in doc: - # Extract all text blocks from the page - blocks = page.get_text("blocks") - - # Sort blocks by their top y-coordinate (y0) - blocks_sorted = sorted(blocks, key=lambda b: b[1]) - - for block in blocks_sorted: - x0, y0, x1, y1, text, *_ = block - - if last_y1 is not None and (y0 - last_y1) > y_tolerance: - # If the gap between blocks is too large, start a new paragraph - if current_paragraph: - paragraphs.append(" ".join(current_paragraph)) - current_paragraph = [] - - current_paragraph.append(text) - last_y1 = y1 - - if current_paragraph: - paragraphs.append(" ".join(current_paragraph)) - current_paragraph = [] + logger.debug("Extracting layout paragraphs from PDF: %s", file_path) + + with pymupdf.open(str(file_path)) as doc: + parsed_doc = pymupdf4llm.parse_document( + doc, + filename=str(file_path), + show_progress=False, + force_text=True, + use_ocr=False, + force_ocr=False, + ) + + chunks = parsed_doc.to_text( + page_chunks=True, + header=include_headers, + footer=include_footers, + show_progress=False, + ) + + paragraphs: list[str] = [] + for chunk in chunks: + page_text = chunk.get("text") or "" + for box in chunk.get("page_boxes") or []: + if box.get("class") in skip_box_classes: + continue + + start, stop = box.get("pos", (0, 0)) + text = _clean_pdf_box_text(page_text[start:stop], box.get("class") or "") + if text: + paragraphs.append(text) return paragraphs -def pdf_to_text( - file_path: Path | str, - y_tolerance: float | None = None, -) -> str: +def pdf_to_text(file_path: Path | str) -> str: """ - Extract text from a PDF file and return normalized plain text. + Extract normalized plain text from a PDF using filtered layout boxes. Args: - file_path (Path): Path to the PDF document. - y_tolerance (float, optional): Maximum vertical gap (in points) to consider blocks part of the same paragraph. 
- If None, it will be computed as the median margin between blocks. Defaults to None. + file_path (Path | str): Path to the PDF document. Returns: str: Cleaned textual content extracted from the PDF. """ - logger.info("Extracting text from PDF: %s", file_path) - - if y_tolerance is None: - y_tolerance = _compute_median_margin_between_blocks(file_path) - - paragraphs = _extract_and_merge_paragraphs(file_path, np.ceil(y_tolerance)) - docu = "\n\n".join(paragraphs) - - return normalize_text(docu) + return "\n\n".join(pdf_to_paragraphs(file_path)) def load_xml_from_docx(path: Path, xmlfile: str = "word/footnotes.xml") -> Any | None: From ff7c9d36c1b1c7ae9631fe856ef73244b962d9a6 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Tue, 17 Mar 2026 17:28:02 +0000 Subject: [PATCH 02/28] =?UTF-8?q?=E2=9C=A8=20feat(normalization):=20enhanc?= =?UTF-8?q?e=20document=20normalization=20to=20preserve=20paragraph=20stru?= =?UTF-8?q?cture?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../routers/misc/document_extract.py | 10 ++- aymurai/text/normalize.py | 67 +++++++++++++------ 2 files changed, 54 insertions(+), 23 deletions(-) diff --git a/aymurai/api/endpoints/routers/misc/document_extract.py b/aymurai/api/endpoints/routers/misc/document_extract.py index 37b7d0a..56e4eaa 100644 --- a/aymurai/api/endpoints/routers/misc/document_extract.py +++ b/aymurai/api/endpoints/routers/misc/document_extract.py @@ -31,7 +31,7 @@ def extraction(path: str) -> str: str: Extracted text from the document. 
""" text = extract_document(path) - return document_normalize(text) if text else "" + return document_normalize(text, preserve_paragraphs=True) if text else "" def run_safe_text_extraction( @@ -112,8 +112,12 @@ def plain_text_extractor(file: UploadFile) -> Document: document_id = data_to_uuid(data) - paragraphs = [line.strip() for line in document.split("\n") if line.strip()] - paragraphs = [re.sub(r"\s{2,}", " ", line) for line in paragraphs] + paragraphs = [ + paragraph.strip() + for paragraph in re.split(r"\n\s*\n+", document) + if paragraph.strip() + ] + paragraphs = [re.sub(r"[ \t]{2,}", " ", paragraph) for paragraph in paragraphs] paragraphs = list(unique_justseen(paragraphs)) return Document(document=paragraphs, document_id=document_id) diff --git a/aymurai/text/normalize.py b/aymurai/text/normalize.py index 9027a0d..6ed7cb9 100644 --- a/aymurai/text/normalize.py +++ b/aymurai/text/normalize.py @@ -2,45 +2,72 @@ import unicodedata -def document_normalize(text: str) -> str: - """Normalize extracted text from documents - * join invalid newlines - * remove continous whitespaces +def _normalize_document_characters(text: str) -> str: + """ + Apply character-level normalization without changing document structure. Args: - text (str): document + text (str): Raw extracted document text. Returns: - str: normalized + str: Character-normalized text. """ - - # normalize character encodings - # text = unicodedata.normalize("NFKD", text) + text = text.replace("\r\n", "\n").replace("\r", "\n") text = unicodedata.normalize("NFKC", text) + text = re.sub(r"(“|”)", '"', text) + text = text.replace("\\/", "/") + text = re.sub(r"[ \t]{2,}", " ", text) + return text + - # remove continous whitespace - text = re.sub(r" {2,}", r" ", text) +def _normalize_paragraph_text(text: str) -> str: + """ + Normalize text inside a single paragraph while preserving paragraph borders. + + Args: + text (str): Paragraph text. + + Returns: + str: Normalized paragraph content. 
+ """ + text = re.sub(r"[ \t]*\n[ \t]*", "\n", text.strip()) # delete newline if NEXT char is: # - lower character or a number - # - punctuanion + # - punctuation text = re.sub(r"\n([a-z0-9;:,\.])", r" \g<1>", text) # delete newline if PREVIOUS char is: # - quote mark - # - punctuanions (except '.' because possible ambiguity) + # - punctuations (except '.' because possible ambiguity) text = re.sub(r"([\w,\"-])\n", r"\g<1> ", text) # cleanup some junk - # - multiple newlines, hyphens - text = re.sub(r"\n{2,}", "\n", text) text = re.sub(r"[-]{2,}", "-", text) text = re.sub(r"\.-", ".", text) + text = re.sub(r" {2,}", " ", text) + return text.strip() - # quotation marks - text = re.sub(r"(“|”)", '"', text) - # scaped slashes - text = text.replace("\/", "/") +def document_normalize(text: str, *, preserve_paragraphs: bool = False) -> str: + """Normalize extracted text from documents. - return text + Args: + text (str): Document text. + preserve_paragraphs (bool): Preserve blank-line paragraph boundaries. + + Returns: + str: Normalized document text. 
+ """ + text = _normalize_document_characters(text) + + if preserve_paragraphs: + paragraphs = [ + _normalize_paragraph_text(paragraph) + for paragraph in re.split(r"\n\s*\n+", text) + if paragraph.strip() + ] + return "\n\n".join(paragraphs) + + text = _normalize_paragraph_text(text) + return re.sub(r"\n{2,}", "\n", text) From 6243dae92bc808ab5cd4850bec01d3a1d8668401 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Tue, 17 Mar 2026 17:36:11 +0000 Subject: [PATCH 03/28] =?UTF-8?q?=F0=9F=93=9D=20docs:=20document=20default?= =?UTF-8?q?=20values=20for=20extractor=20and=20normalization=20helpers?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/extractors/utils.py | 6 +++--- aymurai/text/normalize.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aymurai/text/extractors/utils.py b/aymurai/text/extractors/utils.py index a091440..0137144 100644 --- a/aymurai/text/extractors/utils.py +++ b/aymurai/text/extractors/utils.py @@ -64,9 +64,9 @@ def pdf_to_paragraphs( Args: file_path (Path | str): Path to the PDF document. - include_headers (bool): Whether to keep header boxes. - include_footers (bool): Whether to keep footer boxes. - skip_box_classes (AbstractSet[str]): Layout box classes to ignore. + include_headers (bool): Whether to keep header boxes. Defaults to True. + include_footers (bool): Whether to keep footer boxes. Defaults to True. + skip_box_classes (AbstractSet[str]): Layout box classes to ignore. Defaults to PDF_SKIP_BOX_CLASSES. Returns: list[str]: Normalized paragraph strings extracted from the PDF. diff --git a/aymurai/text/normalize.py b/aymurai/text/normalize.py index 6ed7cb9..4154533 100644 --- a/aymurai/text/normalize.py +++ b/aymurai/text/normalize.py @@ -54,7 +54,7 @@ def document_normalize(text: str, *, preserve_paragraphs: bool = False) -> str: Args: text (str): Document text. - preserve_paragraphs (bool): Preserve blank-line paragraph boundaries. 
+ preserve_paragraphs (bool): Preserve blank-line paragraph boundaries. Defaults to False. Returns: str: Normalized document text. From eda11cc2e3bb869f0d08a1ea50beddde0527fc6c Mon Sep 17 00:00:00 2001 From: jansaldo Date: Tue, 17 Mar 2026 18:12:43 +0000 Subject: [PATCH 04/28] =?UTF-8?q?=F0=9F=A9=B9=20fix(extractors):=20use=20p?= =?UTF-8?q?ymupdf4llm.to=5Ftext=20with=20page=5Fchunks=20for=20pdf=20parag?= =?UTF-8?q?raphs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/extractors/utils.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/aymurai/text/extractors/utils.py b/aymurai/text/extractors/utils.py index 0137144..8db4c66 100644 --- a/aymurai/text/extractors/utils.py +++ b/aymurai/text/extractors/utils.py @@ -74,20 +74,16 @@ def pdf_to_paragraphs( logger.debug("Extracting layout paragraphs from PDF: %s", file_path) with pymupdf.open(str(file_path)) as doc: - parsed_doc = pymupdf4llm.parse_document( + chunks = pymupdf4llm.to_text( doc, filename=str(file_path), - show_progress=False, - force_text=True, - use_ocr=False, - force_ocr=False, - ) - - chunks = parsed_doc.to_text( page_chunks=True, header=include_headers, footer=include_footers, show_progress=False, + force_text=True, + use_ocr=False, + force_ocr=False, ) paragraphs: list[str] = [] From bad66a077c308be851c2b5cb95e8eca0718e6f19 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Mon, 30 Mar 2026 12:00:04 +0000 Subject: [PATCH 05/28] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Add=20DOCX=20and=20P?= =?UTF-8?q?DF=20anonymizer=20modules?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Implemented DocxAnonymizer class to handle anonymization of DOCX documents by replacing sensitive data with label tokens. This includes functionality for unzipping documents, parsing XML, editing content, and adding watermarks. 
- Developed PdfAnonymizer class for anonymizing PDF documents, utilizing pymupdf for document manipulation. This includes layout parsing, font caching, redaction operations, and watermarking. --- aymurai/text/anonymization/__init__.py | 18 +- aymurai/text/anonymization/base.py | 79 ++ .../{doc_anonymizer.py => docx.py} | 62 +- aymurai/text/anonymization/pdf.py | 1258 +++++++++++++++++ 4 files changed, 1385 insertions(+), 32 deletions(-) create mode 100644 aymurai/text/anonymization/base.py rename aymurai/text/anonymization/{doc_anonymizer.py => docx.py} (55%) create mode 100644 aymurai/text/anonymization/pdf.py diff --git a/aymurai/text/anonymization/__init__.py b/aymurai/text/anonymization/__init__.py index 7f839a9..51f3a65 100644 --- a/aymurai/text/anonymization/__init__.py +++ b/aymurai/text/anonymization/__init__.py @@ -1,7 +1,21 @@ from aymurai.text.anonymization.alignment import replace_labels_in_text -from aymurai.text.anonymization.doc_anonymizer import DocAnonymizer +from aymurai.text.anonymization.base import ( + BaseAnonymizer, + InvalidDocumentAnonymizer, + get_anonymizer, + register_anonymizer, + supported_extensions, +) +from aymurai.text.anonymization.docx import DocxAnonymizer +from aymurai.text.anonymization.pdf import PdfAnonymizer __all__ = [ - "DocAnonymizer", + "BaseAnonymizer", + "DocxAnonymizer", + "PdfAnonymizer", + "InvalidDocumentAnonymizer", + "get_anonymizer", + "register_anonymizer", + "supported_extensions", "replace_labels_in_text", ] diff --git a/aymurai/text/anonymization/base.py b/aymurai/text/anonymization/base.py new file mode 100644 index 0000000..a163115 --- /dev/null +++ b/aymurai/text/anonymization/base.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any + + +class InvalidDocumentAnonymizer(Exception): + """Raised when an anonymizer receives an invalid or unsupported document.""" + + +class BaseAnonymizer(ABC): + """Common interface 
shared by all document anonymizers.""" + + extension: str + + @property + def __name__(self) -> str: + return self.__class__.__name__ + + def ensure_file(self, path: Path) -> Path: + if not path.exists(): + raise InvalidDocumentAnonymizer(f"Invalid path: {path}") + return path + + def __call__( + self, + item: dict, + preds: list[dict], + output_dir: str = ".", + render_context: dict[str, Any] | None = None, + ) -> str: + return self.anonymize(item, preds, output_dir, render_context=render_context) + + @abstractmethod + def anonymize( + self, + item: dict, + preds: list[dict], + output_dir: str = ".", + render_context: dict[str, Any] | None = None, + ) -> str: + """Anonymize a document and return the output path.""" + + +_REGISTRY: dict[str, type[BaseAnonymizer]] = {} + + +def register_anonymizer(cls: type[BaseAnonymizer]) -> type[BaseAnonymizer]: + extension = getattr(cls, "extension", None) + if not extension: + raise ValueError( + f"Anonymizer {cls.__name__} must define an 'extension' attribute" + ) + + _REGISTRY[extension.lower()] = cls + return cls + + +def get_anonymizer(extension: str) -> BaseAnonymizer: + normalized = extension.lower() + try: + anonymizer_cls = _REGISTRY[normalized] + except KeyError as exc: + raise ValueError(f"Unsupported extension: {extension}") from exc + return anonymizer_cls() + + +def supported_extensions() -> set[str]: + return set(_REGISTRY.keys()) + + +__all__ = [ + "BaseAnonymizer", + "InvalidDocumentAnonymizer", + "get_anonymizer", + "register_anonymizer", + "supported_extensions", +] diff --git a/aymurai/text/anonymization/doc_anonymizer.py b/aymurai/text/anonymization/docx.py similarity index 55% rename from aymurai/text/anonymization/doc_anonymizer.py rename to aymurai/text/anonymization/docx.py index 7feb6f3..1ea3b95 100644 --- a/aymurai/text/anonymization/doc_anonymizer.py +++ b/aymurai/text/anonymization/docx.py @@ -1,14 +1,20 @@ import os import tempfile from glob import glob +from pathlib import Path +from typing import 
Any from more_itertools import flatten -from aymurai.meta.pipeline_interfaces import Transform from aymurai.text.anonymization.alignment import ( index_paragraphs, match_paragraphs_with_predictions, ) +from aymurai.text.anonymization.base import ( + BaseAnonymizer, + InvalidDocumentAnonymizer, + register_anonymizer, +) from aymurai.text.anonymization.watermarks import add_footer_watermark from aymurai.text.anonymization.xml_docx import ( create_docx, @@ -18,43 +24,40 @@ from aymurai.utils.cache import cache_load, cache_save, get_cache_key -class DocAnonymizer(Transform): +@register_anonymizer +class DocxAnonymizer(BaseAnonymizer): """ - Anonymize document by replacing sensitive data with label tokens + Anonymize DOCX documents by replacing sensitive data with label tokens. """ + extension = "docx" + def __init__(self, use_cache: bool = False): self.use_cache = use_cache - self.render_context = None - - def __call__(self, item: dict, preds: list[dict], output_dir: str = ".") -> None: - """ - Performs the anonymization process on a document. - Args: - item (dict): The document item to be anonymized. - preds (list[dict]): The list of predictions for the document. - output_dir (str, optional): The directory to save the anonymized document. - Defaults to ".". + def anonymize( + self, + item: dict, + preds: list[dict], + output_dir: str = ".", + render_context: dict[str, Any] | None = None, + ) -> str: + item_path = Path(item["path"]) + file_path = self.ensure_file(item_path) - Raises: - ValueError: If the document has an extension other than `.docx`. 
- """ - item_path = item["path"] - - if not os.path.splitext(item_path)[-1] == ".docx": - raise ValueError("Only `.docx` extension is allowed.") + if file_path.suffix.lower() != ".docx": + raise InvalidDocumentAnonymizer("Only `.docx` extension is allowed.") if not item.get("data"): item["data"] = {} - cache_key = get_cache_key(item_path, self.__name__) + cache_key = get_cache_key(str(file_path), self.__name__) if self.use_cache and (cache_data := cache_load(key=cache_key)): paragraphs = cache_data else: # Unzip document into a temporary directory with tempfile.TemporaryDirectory() as tempdir: - unzip_document(item_path, tempdir) + unzip_document(str(file_path), tempdir) # Parse XML files xml_files = glob(f"{tempdir}/**/*.xml", recursive=True) @@ -67,22 +70,21 @@ def __call__(self, item: dict, preds: list[dict], output_dir: str = ".") -> None for paragraph in paragraphs if paragraph["plain_text"].strip() ] - # Matching paragraphs = match_paragraphs_with_predictions(paragraphs, preds) - # Edit XML filess - replace_text_in_xml(paragraphs, tempdir, self.render_context) + # Edit XML files + replace_text_in_xml(paragraphs, tempdir, render_context) # Recreate anonymized document os.makedirs(output_dir, exist_ok=True) - create_docx( - tempdir, - f"{output_dir}/{os.path.basename(item_path)}", - ) + output_path = f"{output_dir}/{os.path.basename(str(file_path))}" + create_docx(tempdir, output_path) # Add watermark to the footer - add_footer_watermark(f"{output_dir}/{os.path.basename(item_path)}") + add_footer_watermark(output_path) if self.use_cache: cache_save(paragraphs, key=cache_key) + + return f"{output_dir}/{os.path.basename(str(file_path))}" diff --git a/aymurai/text/anonymization/pdf.py b/aymurai/text/anonymization/pdf.py new file mode 100644 index 0000000..d9b9503 --- /dev/null +++ b/aymurai/text/anonymization/pdf.py @@ -0,0 +1,1258 @@ +from __future__ import annotations + +import os +import re +from copy import deepcopy +from pathlib import Path +from typing 
import Any +from unicodedata import normalize + +import pymupdf +import pymupdf.layout # noqa: F401 # activates layout support +from jiwer import cer +from pymupdf4llm.helpers import document_layout as pymupdf4llm_document_layout + +from aymurai.logger import get_logger +from aymurai.text.anonymization.alignment import resolve_render_token +from aymurai.text.anonymization.base import ( + BaseAnonymizer, + InvalidDocumentAnonymizer, + register_anonymizer, +) + +logger = get_logger(__name__) + +WATERMARK_TEXT = "Documento anonimizado por AymurAI | https://www.aymurai.info/" + +TEXT_FLAG_ITALIC = 2 +TEXT_FLAG_SERIF = 4 +TEXT_FLAG_MONOSPACED = 8 +TEXT_FLAG_BOLD = 16 +PDF_TAG_MIN_FONT_SIZE = 7.0 +PDF_TAG_FONT_STEP = 0.5 +PDF_TAG_MAX_ABBREVIATION = 3 +PDF_TAG_RECT_X_PADDING = 2.0 +PDF_TAG_RECT_Y_PADDING = 0.75 +PDF_TAG_RECT_INSET = 0.5 +PDF_TAG_RECT_GAP_FACTOR = 0.5 +PDF_TAG_RECT_GAP_MIN = 3.0 +PDF_TAG_RECT_GAP_MAX = 8.0 + +# Vertical overlap ratio required to consider two image rects as matching +_IMAGE_OVERLAP_THRESHOLD = 0.3 + + +def _line_text(line: dict) -> str: + return "".join(span.get("text", "") for span in line.get("spans", [])) + + +def _rect_tuple(value: Any) -> tuple[float, float, float, float]: + if isinstance(value, pymupdf.Rect): + return (float(value.x0), float(value.y0), float(value.x1), float(value.y1)) + if isinstance(value, (list, tuple)) and len(value) == 4: + return (float(value[0]), float(value[1]), float(value[2]), float(value[3])) + raise ValueError(f"Invalid rectangle value: {value}") + + +def _default_style(fallback_size: float = 10.0) -> dict[str, Any]: + return { + "font": "", + "flags": 0, + "color": (0.0, 0.0, 0.0), + "size": fallback_size, + "ascender": 0.8, + "descender": -0.2, + } + + +def _span_text_weight(span: dict) -> tuple[int, float]: + text = str(span.get("text") or "").strip() + return (len(text), float(span.get("size") or 0.0)) + + +def _pdf_color_from_span(span: dict) -> tuple[float, float, float]: + try: + return tuple( + 
float(value) for value in pymupdf.sRGB_to_pdf(int(span.get("color") or 0)) + ) + except Exception: + return (0.0, 0.0, 0.0) + + +def _line_style(line: dict, fallback_size: float = 10.0) -> dict[str, Any]: + spans = [ + span for span in line.get("spans") or [] if str(span.get("text") or "").strip() + ] + if not spans: + return _default_style(fallback_size) + + dominant = max(spans, key=_span_text_weight) + return { + "font": str(dominant.get("font") or ""), + "flags": int(dominant.get("flags") or 0), + "color": _pdf_color_from_span(dominant), + "size": float(dominant.get("size") or fallback_size), + "ascender": float(dominant.get("ascender") or 0.8), + "descender": float(dominant.get("descender") or -0.2), + } + + +def _font_size(line: dict, fallback: float = 10.0) -> float: + spans = line.get("spans") or [] + sizes = [float(span.get("size")) for span in spans if span.get("size")] + if not sizes: + return fallback + size = sum(sizes) / len(sizes) + return max(size * 0.9, PDF_TAG_MIN_FONT_SIZE) + + +def _style_flags(style: dict[str, Any]) -> tuple[bool, bool, bool, bool]: + flags = int(style.get("flags") or 0) + font_label = str(style.get("font") or "").lower() + + is_bold = bool(flags & TEXT_FLAG_BOLD) or "bold" in font_label + is_italic = bool(flags & TEXT_FLAG_ITALIC) or any( + token in font_label for token in ("italic", "oblique") + ) + is_mono = bool(flags & TEXT_FLAG_MONOSPACED) or any( + token in font_label for token in ("courier", "mono", "console") + ) + is_serif = bool(flags & TEXT_FLAG_SERIF) or any( + token in font_label + for token in ("times", "serif", "georgia", "garamond", "mistral") + ) + return is_bold, is_italic, is_mono, is_serif + + +def _base14_fontname_for_style(style: dict[str, Any]) -> str: + """Return a Base-14 font name based on detected style flags.""" + is_bold, is_italic, is_mono, is_serif = _style_flags(style) + + if is_mono: + family = "Courier" + elif is_serif: + family = "Times" + else: + family = "Helvetica" + + variants = { + 
("Helvetica", False, False): "Helvetica", + ("Helvetica", True, False): "Helvetica-Bold", + ("Helvetica", False, True): "Helvetica-Oblique", + ("Helvetica", True, True): "Helvetica-BoldOblique", + ("Times", False, False): "Times-Roman", + ("Times", True, False): "Times-Bold", + ("Times", False, True): "Times-Italic", + ("Times", True, True): "Times-BoldItalic", + ("Courier", False, False): "Courier", + ("Courier", True, False): "Courier-Bold", + ("Courier", False, True): "Courier-Oblique", + ("Courier", True, True): "Courier-BoldOblique", + } + return variants[(family, is_bold, is_italic)] + + +class _FontCache: + """Extracts and caches original fonts from the PDF so replacement text + preserves the exact original typeface whenever possible. + + Fonts are embedded into each page on first use via ``insert_font`` so that + ``insert_textbox`` / ``insert_text`` can reference them by name. + """ + + def __init__(self, doc: pymupdf.Document) -> None: + self._doc = doc + # font_name -> font buffer (bytes) + self._buffers: dict[str, bytes] = {} + # font_name -> registered insertion name for insert_text/insert_textbox + self._registered: dict[str, str] = {} + # page_index -> set of already-inserted font names + self._page_fonts: dict[int, set[str]] = {} + + self._extract_all_fonts() + + # ------------------------------------------------------------------ + def _extract_all_fonts(self) -> None: + """Walk every page and extract font buffers by xref.""" + seen_xrefs: set[int] = set() + for page_idx in range(len(self._doc)): + for font_entry in self._doc.get_page_fonts(page_idx, full=True): + xref = font_entry[0] + if xref in seen_xrefs: + continue + seen_xrefs.add(xref) + + name, ext, _ftype, content = self._doc.extract_font(xref) + if not content or not name: + continue + # Normalise name (some fonts carry subset prefixes like ABCDEF+) + clean = name.split("+")[-1] if "+" in name else name + if clean not in self._buffers: + self._buffers[clean] = content + logger.debug( + 
"FontCache: extracted '%s' (%d bytes)", clean, len(content) + ) + + # ------------------------------------------------------------------ + def resolve(self, style: dict[str, Any], page: pymupdf.Page) -> str: + """Return the best font name to use for *style* on *page*. + + If the original font can be recovered from the document it is + re-embedded into the page and its name is returned. Otherwise a + Base-14 fallback is returned. + """ + original_font = str(style.get("font") or "") + # Strip subset prefix (e.g. BCDEEE+ArialMT -> ArialMT) + clean = original_font.split("+")[-1] if "+" in original_font else original_font + + if clean and clean in self._buffers: + return self._ensure_on_page(clean, page) + + # Try a looser match (case-insensitive, ignoring commas, hyphens, spaces) + normalised = self._normalise_key(clean) + if normalised: + # Exact normalised match + for cached_name in self._buffers: + if self._normalise_key(cached_name) == normalised: + return self._ensure_on_page(cached_name, page) + + # Prefix / contains match (e.g. 
span says "LiberationSansNarrow" + # but cached name is "Liberation Sans Narrow Regular") + for cached_name in self._buffers: + cached_norm = self._normalise_key(cached_name) + if cached_norm.startswith(normalised) or normalised.startswith( + cached_norm + ): + return self._ensure_on_page(cached_name, page) + + # Fallback to Base-14 + return _base14_fontname_for_style(style) + + # ------------------------------------------------------------------ + def _ensure_on_page(self, font_name: str, page: pymupdf.Page) -> str: + """Register the font on *page* if not already done.""" + page_idx = page.number + if page_idx not in self._page_fonts: + self._page_fonts[page_idx] = set() + + # Derive a short insertion name from the font (must start with /) + insert_name = self._registered.get(font_name) + if insert_name is None: + # sanitise: keep only alnum + safe = re.sub(r"[^A-Za-z0-9]", "", font_name)[:20] or "CustomFont" + insert_name = f"F_{safe}" + self._registered[font_name] = insert_name + + if font_name not in self._page_fonts[page_idx]: + try: + page.insert_font( + fontname=insert_name, + fontbuffer=self._buffers[font_name], + ) + except Exception as exc: + logger.debug("FontCache: could not insert '%s': %s", font_name, exc) + return _base14_fontname_for_style({"font": font_name}) + self._page_fonts[page_idx].add(font_name) + + return insert_name + + # ------------------------------------------------------------------ + @staticmethod + def _normalise_key(name: str) -> str: + return re.sub(r"[\-,_ ]", "", name).lower() + + +def _build_flexible_pattern(text: str) -> str: + tokens = [re.escape(tok) for tok in re.split(r"\s+", text.strip()) if tok] + return r"\s+".join(tokens) + + +def _find_flexible( + haystack: str, + needle: str, + start: int = 0, +) -> tuple[int, int] | None: + if not needle: + return None + + idx = haystack.find(needle, start) + if idx >= 0: + return idx, idx + len(needle) + + pattern = _build_flexible_pattern(needle) + if not pattern: + return None + 
+ match = re.search(pattern, haystack[start:]) + if match: + return start + match.start(), start + match.end() + + if start > 0: + match = re.search(pattern, haystack) + if match: + return match.start(), match.end() + + return None + + +def _label_start(label: dict) -> int: + attrs = label.get("attrs") or {} + alt = attrs.get("aymurai_alt_start_char") + start = label.get("start_char") + return int(alt if alt is not None else (start or 0)) + + +def _label_end(label: dict) -> int: + attrs = label.get("attrs") or {} + alt = attrs.get("aymurai_alt_end_char") + end = label.get("end_char") + return int(alt if alt is not None else (end or 0)) + + +def _label_surface_text(label: dict, document: str) -> str: + attrs = label.get("attrs") or {} + alt_text = attrs.get("aymurai_alt_text") + if alt_text: + return str(alt_text) + + start = _label_start(label) + end = _label_end(label) + if 0 <= start < end <= len(document): + return document[start:end] + + text = label.get("text") + return str(text) if text else "" + + +def _same_boundary_candidate(left: dict, right: dict) -> bool: + left_attrs = left.get("attrs") or {} + right_attrs = right.get("attrs") or {} + + if left_attrs.get("aymurai_label") != right_attrs.get("aymurai_label"): + return False + + left_cid = left_attrs.get("canonical_entity_id") + right_cid = right_attrs.get("canonical_entity_id") + if left_cid and right_cid and str(left_cid) != str(right_cid): + return False + + left_text = str(left.get("text") or "").strip() + right_text = str(right.get("text") or "").strip() + return bool(left_text and right_text) + + +def _resolve_token(label: dict, render_context: dict[str, Any] | None) -> str: + boundary_token = label.get("_boundary_token") + if boundary_token: + return boundary_token + + token = resolve_render_token(label, render_context) + return token or "ENT" + + +def _token_parts(token: str) -> tuple[str, str | None]: + match = re.match(r"^(.*?)(?:_(\d+))?$", token) + if not match: + normalized = token.strip() or 
"ENT" + return normalized, None + + base = match.group(1).strip() or "ENT" + suffix = match.group(2) + return base, suffix + + +def _abbreviate_token(base: str, length: int) -> str: + normalized = "".join(char for char in base.upper() if char.isalnum()) + if not normalized: + normalized = "ENT" + return normalized[:length] or normalized[:1] or "E" + + +def _build_display_token_candidates(token: str) -> list[str]: + base, suffix = _token_parts(token.upper()) + candidates: list[str] = [] + + def add(value: str) -> None: + if value and value not in candidates: + candidates.append(value) + + if suffix: + add(f"<{base}_{suffix}>") + add(f"<{base}>") + + for length in (PDF_TAG_MAX_ABBREVIATION, 1): + abbreviated = _abbreviate_token(base, length) + if suffix: + add(f"<{abbreviated}_{suffix}>") + add(f"<{abbreviated}>") + + return candidates + + +def _iter_font_sizes(start_size: float) -> list[float]: + if start_size <= 0: + return [] + + sizes: list[float] = [start_size] + current = start_size + while current - PDF_TAG_FONT_STEP >= PDF_TAG_MIN_FONT_SIZE - 1e-6: + current = round(current - PDF_TAG_FONT_STEP, 2) + if current not in sizes: + sizes.append(current) + + return sizes + + +def _fit_display_token( + token: str, + rect: pymupdf.Rect, + fontname: str, + base_font_size: float, + font_obj: pymupdf.Font | None = None, +) -> tuple[str | None, float | None]: + """Find the best display candidate that fits inside *rect*. + + When *font_obj* is provided its ``text_length`` method is used for pixel- + accurate measurement; otherwise the Base-14 ``pymupdf.get_text_length`` + function is used as a fallback. 
+ """ + if rect.width <= 0 or rect.height <= 0: + return None, None + + available_width = max(rect.width - (2 * PDF_TAG_RECT_INSET), 1.0) + start_size = min(base_font_size, max(rect.height - 1.0, 1.0)) + if start_size < 1.0: + return None, None + + def _measure(text: str, size: float) -> float: + if font_obj is not None: + try: + return font_obj.text_length(text, fontsize=size) + except Exception: + pass + return pymupdf.get_text_length(text, fontname=fontname, fontsize=size) + + for size in _iter_font_sizes(start_size): + for candidate in _build_display_token_candidates(token): + if _measure(candidate, size) <= available_width + 0.1: + return candidate, size + + return None, None + + +def _make_font_obj( + font_cache: _FontCache | None, style: dict[str, Any] +) -> pymupdf.Font | None: + """Try to build a ``pymupdf.Font`` from the cached buffer for accurate + text measurement. Returns ``None`` on failure.""" + if font_cache is None: + return None + original_font = str(style.get("font") or "") + clean = original_font.split("+")[-1] if "+" in original_font else original_font + buf = font_cache._buffers.get(clean) + if not buf: + # Try normalised / prefix lookup + norm = _FontCache._normalise_key(clean) + if norm: + for cached_name, cached_buf in font_cache._buffers.items(): + cached_norm = _FontCache._normalise_key(cached_name) + if ( + cached_norm == norm + or cached_norm.startswith(norm) + or norm.startswith(cached_norm) + ): + buf = cached_buf + break + if buf: + try: + return pymupdf.Font(fontbuffer=buf) + except Exception: + pass + return None + + +def _apply_minimal_boundary_merge( + paragraphs: list[dict], + render_context: dict[str, Any] | None, +) -> None: + for left_par, right_par in zip(paragraphs, paragraphs[1:]): + left_doc = left_par.get("document") or "" + right_doc = right_par.get("document") or "" + left_labels = left_par.get("labels") or [] + right_labels = right_par.get("labels") or [] + + if not left_doc or not right_doc or not left_labels or not 
right_labels: + continue + + left_candidates = [ + label + for label in left_labels + if _label_end(label) >= max(0, len(left_doc) - 2) + ] + right_candidates = [label for label in right_labels if _label_start(label) <= 2] + + if not left_candidates or not right_candidates: + continue + + for left_label in left_candidates: + for right_label in right_candidates: + if not _same_boundary_candidate(left_label, right_label): + continue + + shared_token = _resolve_token(left_label, render_context) + if not shared_token: + shared_token = _resolve_token(right_label, render_context) + if shared_token: + left_label["_boundary_token"] = shared_token + right_label["_boundary_token"] = shared_token + break + + +def _build_layout_paragraphs(parsed_doc: Any) -> list[dict]: + chunks = parsed_doc.to_text( + page_chunks=True, + header=True, + footer=True, + show_progress=False, + ) + + paragraphs: list[dict] = [] + layout_index = 0 + for page_idx, (page, chunk) in enumerate(zip(parsed_doc.pages, chunks)): + page_text = chunk.get("text") or "" + page_boxes = chunk.get("page_boxes") or [] + + for box_meta in page_boxes: + box_idx = int(box_meta["index"]) + if box_idx >= len(page.boxes): + continue + + start, stop = box_meta.get("pos", (0, 0)) + box_text = normalize("NFKC", page_text[start:stop]).strip() + if not box_text: + continue + + box = page.boxes[box_idx] + line_entries: list[dict] = [] + line_text_chunks: list[str] = [] + line_cursor = 0 + + for line_idx, line in enumerate(box.textlines or []): + text = normalize("NFKC", _line_text(line)).strip() + if not text: + continue + + if line_text_chunks: + line_text_chunks.append("\n") + line_cursor += 1 + + line_start = line_cursor + line_text_chunks.append(text) + line_cursor += len(text) + line_end = line_cursor + style = _line_style(line) + + line_entries.append( + { + "page_index": page_idx, + "box_index": box_idx, + "line_index": line_idx, + "bbox": _rect_tuple(line["bbox"]), + "font_size": _font_size(line, 
float(style.get("size") or 10.0)), + "start": line_start, + "end": line_end, + "text": text, + "style": style, + } + ) + + line_text = "".join(line_text_chunks) + if not line_text: + continue + + paragraphs.append( + { + "plain_text": box_text, + "metadata": { + "layout_index": layout_index, + "page_index": page_idx, + "page_number": page.page_number, + "box_index": box_idx, + "boxclass": box.boxclass, + "box_bbox": ( + float(box.x0), + float(box.y0), + float(box.x1), + float(box.y1), + ), + "line_text": line_text, + "lines": line_entries, + }, + } + ) + layout_index += 1 + + return paragraphs + + +def _match_predictions_to_layout( + layout_paragraphs: list[dict], + preds: list[dict], +) -> list[dict]: + if not layout_paragraphs or not preds: + return [] + + available_indices = list(range(len(layout_paragraphs))) + all_indices = list(range(len(layout_paragraphs))) + matched: list[dict] = [] + + normalized_layout_texts = [ + normalize("NFKC", paragraph["plain_text"]).strip() + for paragraph in layout_paragraphs + ] + + for pred_idx, pred in enumerate(preds): + pred_text = normalize("NFKC", str(pred.get("document") or "")).strip() + if not pred_text: + continue + + candidate_pool = available_indices if available_indices else all_indices + exact_idx = next( + ( + idx + for idx in candidate_pool + if normalized_layout_texts[idx] == pred_text + ), + None, + ) + + if exact_idx is None: + exact_idx = min( + candidate_pool, + key=lambda idx: cer(pred_text, normalized_layout_texts[idx]), + ) + + paragraph = deepcopy(layout_paragraphs[exact_idx]) + paragraph["document"] = pred.get("document") or "" + paragraph["labels"] = pred.get("labels") or [] + paragraph["pred_index"] = pred_idx + matched.append(paragraph) + + if exact_idx in available_indices: + available_indices.remove(exact_idx) + + matched.sort(key=lambda paragraph: paragraph["metadata"]["layout_index"]) + return matched + + +def _rect_vertical_overlap(left: pymupdf.Rect, right: pymupdf.Rect) -> float: + overlap = 
max(0.0, min(left.y1, right.y1) - max(left.y0, right.y0)) + min_height = max(min(left.height, right.height), 1e-6) + return overlap / min_height + + +def _group_adjacent_rects( + rects: list[pymupdf.Rect], max_gap: float +) -> list[pymupdf.Rect]: + if not rects: + return [] + + ordered = sorted(rects, key=lambda rect: (rect.y0, rect.x0, rect.x1)) + groups: list[list[pymupdf.Rect]] = [[ordered[0]]] + + for rect in ordered[1:]: + previous = groups[-1][-1] + gap = rect.x0 - previous.x1 + if _rect_vertical_overlap(previous, rect) >= 0.5 and gap <= max_gap: + groups[-1].append(rect) + else: + groups.append([rect]) + + merged_rects: list[pymupdf.Rect] = [] + for group in groups: + merged = pymupdf.Rect(group[0]) + for rect in group[1:]: + merged.include_rect(rect) + merged_rects.append(merged) + + return merged_rects + + +def _pick_rect_group_for_segment( + page: pymupdf.Page, + line: dict, + text: str, + line_x_cursor: dict[tuple[int, int, int], float], +) -> pymupdf.Rect: + clip = pymupdf.Rect(line["bbox"]) + rects = [rect for rect in page.search_for(text, clip=clip) if rect.intersects(clip)] + if not rects: + return clip + + max_gap = min( + max(clip.height * PDF_TAG_RECT_GAP_FACTOR, PDF_TAG_RECT_GAP_MIN), + PDF_TAG_RECT_GAP_MAX, + ) + grouped_rects = _group_adjacent_rects(rects, max_gap=max_gap) + + line_key = (line["page_index"], line["box_index"], line["line_index"]) + min_x = line_x_cursor.get(line_key, clip.x0 - 1) + + for rect in grouped_rects: + if rect.x0 >= min_x - 0.5: + line_x_cursor[line_key] = rect.x1 + return rect + + chosen = grouped_rects[0] + line_x_cursor[line_key] = chosen.x1 + return chosen + + +def _padded_rect(rect: pymupdf.Rect, clip: pymupdf.Rect) -> pymupdf.Rect: + padded = pymupdf.Rect(rect) + pad_x = min(PDF_TAG_RECT_X_PADDING, max(rect.height * 0.2, 0.5)) + pad_y = min(PDF_TAG_RECT_Y_PADDING, max(rect.height * 0.08, 0.25)) + padded.x0 = max(clip.x0, padded.x0 - pad_x) + padded.y0 = max(clip.y0, padded.y0 - pad_y) + padded.x1 = min(clip.x1, 
padded.x1 + pad_x) + padded.y1 = min(clip.y1, padded.y1 + pad_y) + return padded + + +def _render_rect(rect: pymupdf.Rect) -> pymupdf.Rect: + render_rect = pymupdf.Rect(rect) + inset = min(PDF_TAG_RECT_INSET, max(render_rect.height * 0.1, 0.0)) + render_rect.x0 += inset + render_rect.x1 -= inset + if render_rect.x1 <= render_rect.x0: + render_rect = pymupdf.Rect(rect) + return render_rect + + +def _build_page_op( + rect: pymupdf.Rect, + line: dict | None, + token: str, + page: pymupdf.Page | None = None, + font_cache: _FontCache | None = None, + is_image: bool = False, +) -> dict[str, Any]: + line_clip = pymupdf.Rect(line["bbox"]) if line else pymupdf.Rect(rect) + canvas_rect = _padded_rect(rect, line_clip) + render_rect = _render_rect(canvas_rect) + style = (line or {}).get("style") or _default_style() + base_font_size = float((line or {}).get("font_size") or style.get("size") or 10.0) + + # Resolve font: prefer original font from cache, fallback to Base-14 + if font_cache is not None and page is not None: + fontname = font_cache.resolve(style, page) + else: + fontname = _base14_fontname_for_style(style) + + font_obj = _make_font_obj(font_cache, style) + + display_token, fitted_size = _fit_display_token( + token, + render_rect, + fontname, + base_font_size, + font_obj=font_obj, + ) + + if not display_token or fitted_size is None: + logger.warning( + "Could not fit PDF token '%s' inside rect=%s", + token, + tuple(round(value, 2) for value in canvas_rect), + ) + + return { + "redact_rect": canvas_rect, + "canvas_rect": canvas_rect, + "render_rect": render_rect, + "text": display_token, + "logical_token": token, + "fontname": fontname, + "fontsize": fitted_size, + "text_color": style.get("color") or (0.0, 0.0, 0.0), + "is_image": is_image, + "style": style, + } + + +def _image_rects_for_clip( + page: pymupdf.Page, + clip: pymupdf.Rect, +) -> list[pymupdf.Rect]: + """Return bounding rectangles of images that overlap *clip*.""" + rects: list[pymupdf.Rect] = [] + for 
img_info in page.get_image_info(): + bbox = img_info.get("bbox") + if bbox is None: + continue + img_rect = pymupdf.Rect(bbox) + if img_rect.intersects(clip) and img_rect.get_area() > 0: + rects.append(img_rect) + return rects + + +def _entity_overlaps_image( + page: pymupdf.Page, + entity_rect: pymupdf.Rect, + image_rects: list[pymupdf.Rect], +) -> pymupdf.Rect | None: + """If *entity_rect* overlaps an image return the image rect, else None.""" + for img_rect in image_rects: + overlap = _rect_vertical_overlap(entity_rect, img_rect) + if overlap >= _IMAGE_OVERLAP_THRESHOLD and entity_rect.intersects(img_rect): + return img_rect + return None + + +def _collect_page_redactions( + doc: pymupdf.Document, + paragraphs: list[dict], + render_context: dict[str, Any] | None, + font_cache: _FontCache | None = None, +) -> dict[int, list[dict]]: + page_ops: dict[int, list[dict]] = {} + line_x_cursor: dict[tuple[int, int, int], float] = {} + + # Pre-compute image rects per page + page_image_rects: dict[int, list[pymupdf.Rect]] = {} + + for paragraph in paragraphs: + metadata = paragraph.get("metadata") or {} + lines = metadata.get("lines") or [] + if not lines: + continue + + page_index = int(metadata["page_index"]) + page = doc[page_index] + line_text = metadata.get("line_text") or "" + box_clip = pymupdf.Rect(metadata.get("box_bbox") or page.rect) + document = paragraph.get("document") or "" + labels = sorted(paragraph.get("labels") or [], key=_label_start) + search_cursor = 0 + + # Lazy-load image rects for this page + if page_index not in page_image_rects: + page_image_rects[page_index] = _image_rects_for_clip(page, page.rect) + + for label in labels: + entity_text = _label_surface_text(label, document).strip() + if not entity_text: + entity_text = str(label.get("text") or "").strip() + if not entity_text: + continue + + token = _resolve_token(label, render_context) + + span = _find_flexible(line_text, entity_text, start=search_cursor) + if span is None: + span = 
_find_flexible(line_text, entity_text, start=0) + if span is None: + # -- Fallback: direct page search -- + fallback_rects = [ + rect + for rect in page.search_for(entity_text, clip=box_clip) + if rect.intersects(box_clip) + ] + + # Check if this is an image-based entity + if not fallback_rects: + img_match = _try_image_entity( + page, + entity_text, + box_clip, + page_image_rects[page_index], + ) + if img_match is not None: + op = _build_page_op( + img_match, + lines[0] if lines else None, + token, + page=page, + font_cache=font_cache, + is_image=True, + ) + op["image_rect"] = img_match + page_ops.setdefault(page_index, []).append(op) + continue + + if fallback_rects: + grouped_rects = _group_adjacent_rects( + fallback_rects, max_gap=PDF_TAG_RECT_GAP_MAX + ) + fallback_line = lines[0] if lines else None + + # Check if any of these rects overlap an image + for rect in grouped_rects: + img_rect = _entity_overlaps_image( + page, + rect, + page_image_rects[page_index], + ) + op = _build_page_op( + rect, + fallback_line, + token, + page=page, + font_cache=font_cache, + is_image=(img_rect is not None), + ) + if img_rect is not None: + op["image_rect"] = img_rect + page_ops.setdefault(page_index, []).append(op) + continue + + logger.warning( + "Could not map label '%s' on page=%s box=%s", + entity_text, + metadata.get("page_number"), + metadata.get("box_index"), + ) + continue + + search_cursor = span[1] + + # Collect line segments this entity spans + segments: list[tuple[dict, str, pymupdf.Rect]] = [] + for line in lines: + overlap_start = max(span[0], line["start"]) + overlap_end = min(span[1], line["end"]) + if overlap_end <= overlap_start: + continue + + segment_text = line_text[overlap_start:overlap_end].strip() + if not segment_text: + continue + + rect = _pick_rect_group_for_segment( + page, line, segment_text, line_x_cursor + ) + + # Check for image overlap + img_rect = _entity_overlaps_image( + page, + rect, + page_image_rects[page_index], + ) + 
segments.append((line, segment_text, rect, img_rect)) + + if not segments: + continue + + if len(segments) == 1: + # Single-line entity: write the full token + line, _seg_text, rect, img_rect = segments[0] + op = _build_page_op( + rect, + line, + token, + page=page, + font_cache=font_cache, + is_image=(img_rect is not None), + ) + if img_rect is not None: + op["image_rect"] = img_rect + page_ops.setdefault(page_index, []).append(op) + else: + # Multi-line entity: write the token centered on the + # WIDEST segment only; blank the other segments. + widest_idx = max( + range(len(segments)), + key=lambda i: segments[i][2].width, + ) + any_image = any(seg[3] is not None for seg in segments) + + for seg_idx, (seg_line, _seg_text, seg_rect, seg_img) in enumerate( + segments + ): + if seg_idx == widest_idx: + # Primary segment: render the token here + op = _build_page_op( + seg_rect, + seg_line, + token, + page=page, + font_cache=font_cache, + is_image=any_image, + ) + if seg_img is not None: + op["image_rect"] = seg_img + else: + # Secondary segment: just blank it (no text) + op = _build_page_op( + seg_rect, + seg_line, + token, + page=page, + font_cache=font_cache, + is_image=(seg_img is not None), + ) + op["text"] = None # suppress text rendering + op["fontsize"] = None + if seg_img is not None: + op["image_rect"] = seg_img + + page_ops.setdefault(page_index, []).append(op) + + return page_ops + + +def _try_image_entity( + page: pymupdf.Page, + entity_text: str, + clip: pymupdf.Rect, + image_rects: list[pymupdf.Rect], +) -> pymupdf.Rect | None: + """When text search fails, check whether the entity region corresponds to + an image in the PDF (e.g. a scanned signature or stamp). If an image + overlaps the *clip* area, return its bounding rect so we can blank it. + + We try to locate the entity text on the page (ignoring clip) first: + if the text is found near an image, that image is the match. 
+ Otherwise we fall back to returning the image with the best spatial + overlap with *clip*. + """ + if not image_rects: + return None + + # Try unclipped text search — the entity might be rendered as real text + # on top of (or near) an image. + text_hits = page.search_for(entity_text) + if text_hits: + for hit_rect in text_hits: + for img_rect in image_rects: + if hit_rect.intersects(img_rect): + return img_rect + + # Fallback: pick the image whose intersection with *clip* is largest + best: pymupdf.Rect | None = None + best_area = 0.0 + for img_rect in image_rects: + if not img_rect.intersects(clip) or img_rect.get_area() <= 0: + continue + intersection = img_rect & clip + area = intersection.get_area() + if area > best_area: + best_area = area + best = img_rect + + return best + + +def _apply_redactions( + doc: pymupdf.Document, + page_ops: dict[int, list[dict]], + font_cache: _FontCache | None = None, +) -> None: + for page_idx, ops in page_ops.items(): + page = doc[page_idx] + + # 1) Add text redaction annotations (non-image ops only). + # Image entities are handled separately with white-rect overlay + # to avoid PDF_REDACT_IMAGE_REMOVE which destroys ALL images on + # the page. + for op in ops: + if not op.get("is_image"): + page.add_redact_annot( + op["redact_rect"], + text=None, + fill=(1, 1, 1), + cross_out=False, + ) + + # 2) Apply text redactions (images are never touched here) + page.apply_redactions( + images=pymupdf.PDF_REDACT_IMAGE_NONE, + graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, + text=pymupdf.PDF_REDACT_TEXT_REMOVE, + ) + + # 3) Draw white canvas + centered replacement text + for op in ops: + is_image = op.get("is_image", False) + + if is_image: + # For image entities, paint a white rect that covers the + # FULL image bounding box (not just the entity text rect) + # so the original content is completely hidden. 
+ img_rect = op.get("image_rect") + if img_rect is not None: + page.draw_rect( + img_rect, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) + + # Always white-out the canvas area (text or image) + canvas = op["canvas_rect"] + page.draw_rect( + canvas, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) + + if not op.get("text") or not op.get("fontsize"): + continue + + render = op["render_rect"] + style = op.get("style") or {} + + # --- Text insertion strategy --- + # ``page.insert_textbox`` / ``insert_text`` do NOT support fonts + # registered via ``page.insert_font`` — they only understand + # Base-14 names or ``fontfile`` paths. We therefore use + # ``TextWriter.fill_textbox`` which accepts a ``pymupdf.Font`` + # object built directly from the cached buffer, giving us both + # correct typeface and native center alignment. + + written = False + + # Attempt 1: TextWriter with original font buffer + if font_cache is not None and not written: + font_obj = _make_font_obj(font_cache, style) + if font_obj is not None: + try: + tw = pymupdf.TextWriter(page.rect, color=op["text_color"]) + tw.fill_textbox( + render, + op["text"], + font=font_obj, + fontsize=op["fontsize"], + align=pymupdf.TEXT_ALIGN_CENTER, + ) + tw.write_text(page, overlay=True) + written = True + except Exception as exc: + logger.debug( + "TextWriter failed for '%s': %s", + op["text"], + exc, + ) + + # Attempt 2: insert_textbox with Base-14 fallback font + if not written: + base14 = _base14_fontname_for_style(style) + try: + page.insert_textbox( + render, + op["text"], + fontname=base14, + fontsize=op["fontsize"], + color=op["text_color"], + align=pymupdf.TEXT_ALIGN_CENTER, + overlay=True, + ) + written = True + except Exception as exc: + logger.debug( + "insert_textbox (Base-14) failed for '%s': %s", + op["text"], + exc, + ) + + # Attempt 3: insert_text centered with Base-14 + if not written: + base14 = _base14_fontname_for_style(style) + try: + descender = 0.2 + 
baseline_y = render.y1 - (descender * op["fontsize"]) + baseline_y = min( + max(baseline_y, render.y0 + 1.0), + render.y1 - 0.25, + ) + text_w = pymupdf.get_text_length( + op["text"], + fontname=base14, + fontsize=op["fontsize"], + ) + x_start = render.x0 + max((render.width - text_w) / 2.0, 0.0) + page.insert_text( + (x_start, baseline_y), + op["text"], + fontname=base14, + fontsize=op["fontsize"], + color=op["text_color"], + overlay=True, + ) + except Exception as exc: + logger.warning( + "All text insertion methods failed for '%s': %s", + op["text"], + exc, + ) + + +def _add_footer_watermark(doc: pymupdf.Document) -> None: + for page in doc: + text_width = pymupdf.get_text_length( + WATERMARK_TEXT, + fontname="helv", + fontsize=8, + ) + x_pos = max(24.0, page.rect.width - text_width - 24.0) + y_pos = page.rect.height - 12.0 + page.insert_text( + (x_pos, y_pos), + WATERMARK_TEXT, + fontsize=8, + fontname="helv", + color=(0.72, 0.72, 0.72), + ) + + +@register_anonymizer +class PdfAnonymizer(BaseAnonymizer): + extension = "pdf" + + def anonymize( + self, + item: dict, + preds: list[dict], + output_dir: str = ".", + render_context: dict[str, Any] | None = None, + ) -> str: + item_path = Path(item["path"]) + file_path = self.ensure_file(item_path) + + if file_path.suffix.lower() != ".pdf": + raise InvalidDocumentAnonymizer("Only `.pdf` extension is allowed.") + + with pymupdf.open(str(file_path)) as doc: + parsed_doc = pymupdf4llm_document_layout.parse_document( + doc, + filename=str(file_path), + show_progress=False, + force_text=True, + use_ocr=False, + force_ocr=False, + ) + + # Build font cache to preserve original typefaces + font_cache = _FontCache(doc) + + layout_paragraphs = _build_layout_paragraphs(parsed_doc) + matched_paragraphs = _match_predictions_to_layout(layout_paragraphs, preds) + + _apply_minimal_boundary_merge(matched_paragraphs, render_context) + page_ops = _collect_page_redactions( + doc, + matched_paragraphs, + render_context, + 
font_cache=font_cache, + ) + _apply_redactions(doc, page_ops, font_cache=font_cache) + _add_footer_watermark(doc) + + os.makedirs(output_dir, exist_ok=True) + output_path = Path(output_dir) / f"{file_path.stem}.anonymized.pdf" + doc.save(str(output_path)) + + return str(output_path) From 8759a7909e1804452610ecd84db3909231bf1747 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Mon, 30 Mar 2026 12:04:30 +0000 Subject: [PATCH 06/28] =?UTF-8?q?=F0=9F=94=A7=20Enhance=20PDF=20and=20DOCX?= =?UTF-8?q?=20handling=20in=20anonymization=20process?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../routers/anonymizer/anonymizer.py | 102 +++++++++--------- 1 file changed, 54 insertions(+), 48 deletions(-) diff --git a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py index 65a3613..ca9da63 100644 --- a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py +++ b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py @@ -5,7 +5,7 @@ from threading import Lock import torch -from fastapi import Body, Depends, Form, Query, UploadFile +from fastapi import Body, Depends, Form, HTTPException, Query, UploadFile from fastapi.responses import FileResponse from fastapi.routing import APIRouter from sqlmodel import Session @@ -31,7 +31,10 @@ TextRequest, ) from aymurai.settings import settings -from aymurai.text.anonymization import DocAnonymizer, replace_labels_in_text +from aymurai.text.anonymization import ( + InvalidDocumentAnonymizer, + get_anonymizer, +) from aymurai.text.extraction import MIMETYPE_EXTENSION_MAPPER from aymurai.utils.entity_disambiguation import ( build_canonical_entities, @@ -514,11 +517,21 @@ async def anonymizer_compile_document( """ logger.info(f"receiving => {file.filename}") extension = MIMETYPE_EXTENSION_MAPPER.get(file.content_type) - logger.info(f"detection extension: {extension} ({file.content_type})") + file_suffix = os.path.splitext(file.filename or 
"")[1].lower() + + if extension is None and file_suffix: + extension = file_suffix.lstrip(".") + + if extension not in {"docx", "pdf"}: + raise HTTPException( + status_code=400, + detail=f"Unsupported format for anonymization: {extension or 'unknown'}", + ) + + logger.info(f"detected extension: {extension} ({file.content_type})") # Create a temporary file - _, suffix = os.path.splitext(file.filename) - suffix = suffix if suffix == ".docx" else ".txt" + suffix = f".{extension}" tmp_dir = tempfile.gettempdir() # Use delete=False to avoid the file being deleted when the NamedTemporaryFile object is closed @@ -537,7 +550,7 @@ async def anonymizer_compile_document( annots_json = json.loads(annotations) annots = DocumentAnnotations.model_validate(annots_json) - logger.info(f"processing annotations => {annots}") + effective_label_policies = _merge_label_policies(annots.label_policies) effective_render_policy = _merge_render_policy(annots.render_policy) @@ -562,9 +575,6 @@ async def anonymizer_compile_document( override=False, ) - # Anonymize the document - doc_anonymizer = DocAnonymizer() - filtered_annotations = [] for paragraph in annots.data: filtered_labels = [ @@ -583,39 +593,36 @@ async def anonymizer_compile_document( filtered_annotations, effective_render_policy, effective_label_policies ) - if suffix == ".docx": - item = {"path": tmp_filename} - doc_anonymizer.render_context = render_context - doc_anonymizer( - item, - [ - document_information.model_dump() - for document_information in filtered_annotations - ], + preds = [ + document_information.model_dump() + for document_information in filtered_annotations + ] + + try: + anonymizer = get_anonymizer(extension) + anonymized_path = anonymizer( + {"path": tmp_filename}, + preds, tmp_dir, + render_context=render_context, + ) + except (ValueError, InvalidDocumentAnonymizer) as exc: + if os.path.exists(tmp_filename): + os.remove(tmp_filename) + raise HTTPException(status_code=400, detail=str(exc)) from exc + + if 
extension == "pdf": + if os.path.exists(tmp_filename): + os.remove(tmp_filename) + + return FileResponse( + anonymized_path, + background=BackgroundTask(os.remove, anonymized_path), + media_type="application/pdf", + filename=f"{os.path.splitext(file.filename)[0]}.pdf", ) - logger.info(f"saved temp file on local storage => {tmp_filename}") - - else: - # Export as raw document - anonymized_doc = [ - replace_labels_in_text( - document_information.model_dump(), - render_context=render_context, - ) - .replace("<", "<") - .replace(">", ">") - for document_information in filtered_annotations - ] - with open(tmp_filename, "w") as f: - f.write("\n".join(anonymized_doc)) - - # Add watermark to the end of the document - f.write( - "\n\nDocumento anonimizado por AymurAI\n\nhttps://www.aymurai.info/" - ) - # Convert to ODT + # DOCX flow keeps ODT output cmd = [ settings.LIBREOFFICE_BIN, "--headless", @@ -623,9 +630,8 @@ async def anonymizer_compile_document( "odt", "--outdir", tmp_dir, - tmp_filename, + anonymized_path, ] - logger.info(f"Executing: {' '.join(cmd)}") try: @@ -633,20 +639,20 @@ async def anonymizer_compile_document( cmd, shell=False, encoding="utf-8", errors="ignore" ) logger.info(f"LibreOffice output: {output}") - except subprocess.CalledProcessError as e: + except subprocess.CalledProcessError as exc: raise RuntimeError( - f"LibreOffice conversion failed: {e.output.decode('utf-8', errors='ignore')}" - ) + f"LibreOffice conversion failed: {exc.output.decode('utf-8', errors='ignore')}" + ) from exc + finally: + if os.path.exists(tmp_filename): + os.remove(tmp_filename) - odt = tmp_filename.replace(suffix, ".odt") + odt = f"{os.path.splitext(anonymized_path)[0]}.odt" logger.info(f"Expected output file path: {odt}") if not os.path.exists(odt): raise RuntimeError(f"File at path {odt} does not exist.") - # Ensure the temporary file is deleted - os.remove(tmp_filename) - return FileResponse( odt, background=BackgroundTask(os.remove, odt), From 
c608750da8e23baf6d6de746322d9cbd769e12ab Mon Sep 17 00:00:00 2001 From: jansaldo Date: Mon, 30 Mar 2026 12:05:08 +0000 Subject: [PATCH 07/28] =?UTF-8?q?=F0=9F=93=9D=20Update=20backend=20module?= =?UTF-8?q?=20references=20for=20document=20rendering=20in=20README?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/es/pipelines/anonymizer/README.md | 2 +- docs/pipelines/anonymizer/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/es/pipelines/anonymizer/README.md b/docs/es/pipelines/anonymizer/README.md index 2d24161..f7f7185 100644 --- a/docs/es/pipelines/anonymizer/README.md +++ b/docs/es/pipelines/anonymizer/README.md @@ -47,7 +47,7 @@ Fuente editable: [../../../pipelines/anonymizer/pipeline.excalidraw](../../../pi ### Módulos backend relevantes - Router: `aymurai/api/endpoints/routers/anonymizer/anonymizer.py` -- Render/anonymize: `aymurai/text/anonymization/doc_anonymizer.py` +- Render/anonymize: `aymurai/text/anonymization/docx.py` and `aymurai/text/anonymization/pdf.py` - Desambiguación canónica: `aymurai/utils/entity_disambiguation/` ## Persistencia (DB) diff --git a/docs/pipelines/anonymizer/README.md b/docs/pipelines/anonymizer/README.md index 11e864e..67880ba 100644 --- a/docs/pipelines/anonymizer/README.md +++ b/docs/pipelines/anonymizer/README.md @@ -47,7 +47,7 @@ Editable source: [pipeline.excalidraw](pipeline.excalidraw) ### Core backend modules - Router: `aymurai/api/endpoints/routers/anonymizer/anonymizer.py` -- Rendering: `aymurai/text/anonymization/doc_anonymizer.py` +- Rendering: `aymurai/text/anonymization/docx.py` and `aymurai/text/anonymization/pdf.py` - Canonical entity mapping: `aymurai/utils/entity_disambiguation/` ## Persistence (DB) From 0dec42366c2aa34e0f09cd72410d8f49fd5cffb5 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Mon, 30 Mar 2026 12:06:06 +0000 Subject: [PATCH 08/28] =?UTF-8?q?=E2=9C=85=20Update=20tests=20to=20use=20D?= 
=?UTF-8?q?OCX=20format=20for=20document=20anonymization=20and=20enhance?= =?UTF-8?q?=20mock=20behavior?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api/routers/anonymizer/test_anonymizer.py | 40 ++++++++++++++++--- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py index 54a627e..87f515e 100644 --- a/tests/api/routers/anonymizer/test_anonymizer.py +++ b/tests/api/routers/anonymizer/test_anonymizer.py @@ -1,6 +1,6 @@ import json import subprocess -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest @@ -294,9 +294,18 @@ def test_should_return_validation_when_paragraph_exists(client, db_session): @pytest.mark.integration @patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.subprocess.check_output") +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") def test_should_anonymize_document_when_annotations_are_valid( - mock_check_output, client + mock_get_anonymizer, mock_check_output, client, tmp_path ): + # Fake anonymizer that writes a dummy docx output + anonymized_path = str(tmp_path / "output.docx") + with open(anonymized_path, "wb") as f: + f.write(b"fake-docx-content") + + mock_anonymizer = MagicMock(return_value=anonymized_path) + mock_get_anonymizer.return_value = mock_anonymizer + def fake_convert(*args, **kwargs): cmd = args[0] source_path = cmd[-1] @@ -320,7 +329,13 @@ def fake_convert(*args, **kwargs): response = client.post( "/anonymizer/anonymize-document", data={"annotations": json.dumps(annotations)}, - files={"file": ("sample.txt", b"input-document", "text/plain")}, + files={ + "file": ( + "sample.docx", + b"input-document", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + }, ) assert response.status_code == 200 @@ -330,9 +345,18 @@ def fake_convert(*args, **kwargs): @pytest.mark.integration 
@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.subprocess.check_output") +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") def test_should_return_500_when_anonymize_document_conversion_fails( - mock_check_output, client + mock_get_anonymizer, mock_check_output, client, tmp_path ): + # Fake anonymizer that writes a dummy output + anonymized_path = str(tmp_path / "output.docx") + with open(anonymized_path, "wb") as f: + f.write(b"fake-docx-content") + + mock_anonymizer = MagicMock(return_value=anonymized_path) + mock_get_anonymizer.return_value = mock_anonymizer + mock_check_output.side_effect = subprocess.CalledProcessError( 1, ["libreoffice"], @@ -347,7 +371,13 @@ def test_should_return_500_when_anonymize_document_conversion_fails( response = client.post( "/anonymizer/anonymize-document", data={"annotations": json.dumps(annotations)}, - files={"file": ("sample.txt", b"input-document", "text/plain")}, + files={ + "file": ( + "sample.docx", + b"input-document", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + }, ) assert response.status_code == 500 From c107647b15c4eb1aee8d09514e5f25fb080c7bc6 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Mon, 30 Mar 2026 12:07:51 +0000 Subject: [PATCH 09/28] =?UTF-8?q?=E2=9C=A8=20Add=20end-to-end=20PDF=20anon?= =?UTF-8?q?ymization=20notebook=20with=20PyMuPDF=20and=20AymurAI=20API?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pdf-support/06-pymupdf-layout.ipynb | 253 ++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 notebooks/experiments/pdf-support/06-pymupdf-layout.ipynb diff --git a/notebooks/experiments/pdf-support/06-pymupdf-layout.ipynb b/notebooks/experiments/pdf-support/06-pymupdf-layout.ipynb new file mode 100644 index 0000000..803c8d2 --- /dev/null +++ b/notebooks/experiments/pdf-support/06-pymupdf-layout.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": null, + "id": "1098eca1", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext rich\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "markdown", + "id": "7e81fbe5", + "metadata": {}, + "source": [ + "# End-to-End PDF Anonymization (PyMuPDF Layout + AymurAI API)\n", + "This notebook builds layout-based paragraphs from the source PDF, runs `/anonymizer/predict` + `/anonymizer/disambiguate`, and compiles an anonymized PDF.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "258fbd18", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import time\n", + "from pathlib import Path\n", + "\n", + "import pymupdf\n", + "import requests\n", + "from tqdm.auto import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fcfd985e", + "metadata": {}, + "outputs": [], + "source": [ + "# Change these values to test different documents/environments.\n", + "API_URL = \"http://localhost:8999\"\n", + "SOURCE_PDF = Path(\"./document.pdf\")\n", + "\n", + "OUTPUT_DIR = Path(\"./output\")\n", + "USE_CACHE = False\n", + "\n", + "# Optional: keep as None to rely on backend default policies.\n", + "LABEL_POLICIES = None\n", + "\n", + "# Keep aligned with current anonymizer defaults.\n", + "RENDER_POLICY = {\"suffix_mode\": \"auto\", \"suffix_threshold\": 1}\n", + "\n", + "SOURCE_PDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3860b71", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_document_via_api(pdf_path: Path) -> dict:\n", + " with pdf_path.open(\"rb\") as handle:\n", + " response = requests.post(\n", + " f\"{API_URL}/document-extract\",\n", + " files={\"file\": (pdf_path.name, handle, \"application/pdf\")},\n", + " timeout=600,\n", + " )\n", + "\n", + " response.raise_for_status()\n", + " return response.json()\n", + "\n", + "\n", + "def predict_paragraph(text: str, retries: int = 2) -> dict:\n", + " last_error = 
None\n", + " for attempt in range(retries + 1):\n", + " try:\n", + " response = requests.post(\n", + " f\"{API_URL}/anonymizer/predict\",\n", + " json={\"text\": text},\n", + " params={\"use_cache\": USE_CACHE},\n", + " timeout=600,\n", + " )\n", + " response.raise_for_status()\n", + " return response.json()\n", + " except Exception as exc:\n", + " last_error = exc\n", + " if attempt < retries:\n", + " time.sleep(2)\n", + " else:\n", + " raise last_error\n", + "\n", + " raise RuntimeError(\"Predict request exhausted retries\")\n", + "\n", + "\n", + "def disambiguate(predictions: list[dict]) -> dict:\n", + " payload = {\"paragraphs\": predictions}\n", + " if LABEL_POLICIES is not None:\n", + " payload[\"label_policies\"] = LABEL_POLICIES\n", + "\n", + " response = requests.post(\n", + " f\"{API_URL}/anonymizer/disambiguate\",\n", + " json=payload,\n", + " timeout=600,\n", + " )\n", + " response.raise_for_status()\n", + " return response.json()\n", + "\n", + "\n", + "def compile_pdf(pdf_path: Path, annotations: dict) -> Path:\n", + " payload = {\n", + " \"data\": annotations[\"data\"],\n", + " \"render_policy\": RENDER_POLICY,\n", + " }\n", + " if annotations.get(\"label_policies\") is not None:\n", + " payload[\"label_policies\"] = annotations[\"label_policies\"]\n", + "\n", + " with pdf_path.open(\"rb\") as handle:\n", + " response = requests.post(\n", + " f\"{API_URL}/anonymizer/anonymize-document\",\n", + " data={\"annotations\": json.dumps(payload, ensure_ascii=False)},\n", + " files={\"file\": (pdf_path.name, handle, \"application/pdf\")},\n", + " timeout=1200,\n", + " )\n", + "\n", + " response.raise_for_status()\n", + "\n", + " OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", + " output_path = OUTPUT_DIR / f\"{pdf_path.stem}.anonymized.pdf\"\n", + " output_path.write_bytes(response.content)\n", + " return output_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0a54485", + "metadata": {}, + "outputs": [], + "source": [ + 
"document_extract_payload = extract_document_via_api(SOURCE_PDF)\n", + "paragraphs = document_extract_payload[\"document\"]\n", + "\n", + "print(f\"Document ID: {document_extract_payload['document_id']}\")\n", + "print(f\"Paragraphs extracted: {len(paragraphs)}\")\n", + "\n", + "paragraphs[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3beaadee", + "metadata": {}, + "outputs": [], + "source": [ + "predictions = [\n", + " predict_paragraph(paragraph)\n", + " for paragraph in tqdm(paragraphs, desc=\"Predicting paragraphs\")\n", + "]\n", + "total_labels = sum(len(pred.get(\"labels\") or []) for pred in predictions)\n", + "print(f\"Predictions: {len(predictions)} paragraphs, {total_labels} labels\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "682760e0", + "metadata": {}, + "outputs": [], + "source": [ + "disambiguated = disambiguate(predictions)\n", + "total_labels = sum(len(pred.get(\"labels\") or []) for pred in disambiguated[\"data\"])\n", + "print(f\"Disambiguated labels: {total_labels}\")\n", + "disambiguated.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eae3f2c9", + "metadata": {}, + "outputs": [], + "source": [ + "[data for data in disambiguated[\"data\"] if data[\"labels\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "665dde4a", + "metadata": {}, + "outputs": [], + "source": [ + "output_pdf = compile_pdf(SOURCE_PDF, disambiguated)\n", + "print(output_pdf.resolve())\n", + "output_pdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "715a782a", + "metadata": {}, + "outputs": [], + "source": [ + "with pymupdf.open(str(output_pdf)) as doc:\n", + " watermark_hits = sum(\n", + " len(page.search_for(\"Documento anonimizado por AymurAI\")) for page in doc\n", + " )\n", + " print(f\"Pages: {doc.page_count}\")\n", + " print(f\"Watermark hits: {watermark_hits}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"id": "2a274809", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "aymurai", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.20" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From f1ac135d14c28647058f670baa9ba92eae3576fa Mon Sep 17 00:00:00 2001 From: jansaldo Date: Mon, 6 Apr 2026 13:22:46 +0000 Subject: [PATCH 10/28] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Rework=20PDF=20anony?= =?UTF-8?q?mization=20for=20precise=20spans=20and=20widget=20handling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/anonymization/alignment.py | 55 +- aymurai/text/anonymization/pdf.py | 1198 +++++++++++++++++------ pyproject.toml | 1 + 3 files changed, 925 insertions(+), 329 deletions(-) diff --git a/aymurai/text/anonymization/alignment.py b/aymurai/text/anonymization/alignment.py index 3a6386b..21287c9 100644 --- a/aymurai/text/anonymization/alignment.py +++ b/aymurai/text/anonymization/alignment.py @@ -9,9 +9,9 @@ from joblib import hash from more_itertools import flatten +from aymurai.meta.api_interfaces import LabelPolicy from aymurai.models.flair.utils import FlairTextNormalize from aymurai.utils.alignment.core import align_text, tokenize -from aymurai.meta.api_interfaces import LabelPolicy REGEX_PARAGRAPH = r"((?.*?)(\/w:p\b)" REGEX_FRAGMENT = r"(?(?P.*?)(<.*?\/w:t)" @@ -61,6 +61,47 @@ def resolve_render_token(label: dict, render_context: dict | None = None) -> str return f"{base}_{index}" +def _label_replacement_start(label: dict) -> int: + attrs = label.get("attrs") or {} + alt_start = attrs.get("aymurai_alt_start_char") + start_char = label.get("start_char") + return int(alt_start if alt_start is not None else 
(start_char or 0)) + + +def _label_replacement_end(label: dict) -> int: + attrs = label.get("attrs") or {} + alt_end = attrs.get("aymurai_alt_end_char") + end_char = label.get("end_char") + return int(alt_end if alt_end is not None else (end_char or 0)) + + +def _label_replacement_text(label: dict, document: str) -> str: + attrs = label.get("attrs") or {} + + if "aymurai_alt_text" in attrs: + alt_text = attrs["aymurai_alt_text"] + return str(alt_text) if alt_text else "" + + alt_start = attrs.get("aymurai_alt_start_char") + alt_end = attrs.get("aymurai_alt_end_char") + if alt_start is not None and alt_end is not None: + start_char, end_char = int(alt_start), int(alt_end) + if 0 <= start_char < end_char <= len(document): + return document[start_char:end_char] + return "" + + if "aymurai_alt_start_char" in attrs and alt_start is None: + return "" + + start_char = int(label.get("start_char") or 0) + end_char = int(label.get("end_char") or 0) + if 0 <= start_char < end_char <= len(document): + return document[start_char:end_char] + + text = label.get("text") + return str(text) if text else "" + + def unify_consecutive_labels( sample: dict, text_key: str = "document", @@ -93,9 +134,11 @@ def unify_consecutive_labels( # Iterate over labels for label in labels: # Get attributes - text = label["attrs"]["aymurai_alt_text"] or label["text"] - start_char = label["attrs"]["aymurai_alt_start_char"] or label["start_char"] - end_char = label["attrs"]["aymurai_alt_end_char"] or label["end_char"] + text = _label_replacement_text(label, document) + start_char = _label_replacement_start(label) + end_char = _label_replacement_end(label) + if not text or end_char <= start_char: + continue aymurai_label = resolve_render_token(label, render_context) if current_group is None: @@ -115,7 +158,7 @@ def unify_consecutive_labels( else: # Finish the current group and start a new one current_group["text"] = document[ - current_group["start_char"] : current_group["end_char"] + 1 + 
current_group["start_char"] : current_group["end_char"] ] unified_labels.append(current_group) current_group = { @@ -128,7 +171,7 @@ def unify_consecutive_labels( # Finish the last group if current_group is not None: current_group["text"] = document[ - current_group["start_char"] : current_group["end_char"] + 1 + current_group["start_char"] : current_group["end_char"] ] unified_labels.append(current_group) diff --git a/aymurai/text/anonymization/pdf.py b/aymurai/text/anonymization/pdf.py index d9b9503..23840cf 100644 --- a/aymurai/text/anonymization/pdf.py +++ b/aymurai/text/anonymization/pdf.py @@ -7,6 +7,8 @@ from typing import Any from unicodedata import normalize +import cv2 +import numpy as np import pymupdf import pymupdf.layout # noqa: F401 # activates layout support from jiwer import cer @@ -31,8 +33,8 @@ PDF_TAG_MIN_FONT_SIZE = 7.0 PDF_TAG_FONT_STEP = 0.5 PDF_TAG_MAX_ABBREVIATION = 3 -PDF_TAG_RECT_X_PADDING = 2.0 -PDF_TAG_RECT_Y_PADDING = 0.75 +PDF_TAG_RECT_X_PADDING = 0.5 +PDF_TAG_RECT_Y_PADDING = 0.0 PDF_TAG_RECT_INSET = 0.5 PDF_TAG_RECT_GAP_FACTOR = 0.5 PDF_TAG_RECT_GAP_MIN = 3.0 @@ -41,6 +43,11 @@ # Vertical overlap ratio required to consider two image rects as matching _IMAGE_OVERLAP_THRESHOLD = 0.3 +# DPI used to rasterise PDF image regions for OpenCV editing. +_IMAGE_EDIT_DPI = 200 +_IMAGE_EDIT_MASK_DILATE = 1 +_IMAGE_EDIT_INPAINT_RADIUS = 3 + def _line_text(line: dict) -> str: return "".join(span.get("text", "") for span in line.get("spans", [])) @@ -97,6 +104,55 @@ def _line_style(line: dict, fallback_size: float = 10.0) -> dict[str, Any]: } +def _build_spans_detail(line: dict) -> tuple[list[dict], int]: + """Build per-span style info with character offsets for entity-level + style lookup. 
Returns ``(spans_detail, strip_offset)``.""" + raw_text = normalize("NFKC", _line_text(line)) + strip_offset = len(raw_text) - len(raw_text.lstrip()) + + spans_detail: list[dict] = [] + cursor = 0 + for span in line.get("spans", []): + span_text = normalize("NFKC", span.get("text", "")) + span_start = cursor + cursor += len(span_text) + spans_detail.append( + { + "start": span_start, + "end": cursor, + "style": { + "font": str(span.get("font") or ""), + "flags": int(span.get("flags") or 0), + "color": _pdf_color_from_span(span), + "size": float(span.get("size") or 10.0), + "ascender": float(span.get("ascender") or 0.8), + "descender": float(span.get("descender") or -0.2), + }, + } + ) + return spans_detail, strip_offset + + +def _entity_style_from_spans( + line_entry: dict, + offset_in_stripped_text: int, +) -> dict[str, Any]: + """Return the style of the span at *offset_in_stripped_text* within the + line entry's (stripped) text. Falls back to line-level dominant style.""" + spans_detail = line_entry.get("spans_detail") + if not spans_detail: + return line_entry.get("style") or _default_style() + + strip_offset = line_entry.get("strip_offset", 0) + raw_offset = offset_in_stripped_text + strip_offset + + for span_info in spans_detail: + if span_info["start"] <= raw_offset < span_info["end"]: + return span_info["style"] + + return line_entry.get("style") or _default_style() + + def _font_size(line: dict, fallback: float = 10.0) -> float: spans = line.get("spans") or [] sizes = [float(span.get("size")) for span in spans if span.get("size")] @@ -152,116 +208,6 @@ def _base14_fontname_for_style(style: dict[str, Any]) -> str: return variants[(family, is_bold, is_italic)] -class _FontCache: - """Extracts and caches original fonts from the PDF so replacement text - preserves the exact original typeface whenever possible. - - Fonts are embedded into each page on first use via ``insert_font`` so that - ``insert_textbox`` / ``insert_text`` can reference them by name. 
- """ - - def __init__(self, doc: pymupdf.Document) -> None: - self._doc = doc - # font_name -> font buffer (bytes) - self._buffers: dict[str, bytes] = {} - # font_name -> registered insertion name for insert_text/insert_textbox - self._registered: dict[str, str] = {} - # page_index -> set of already-inserted font names - self._page_fonts: dict[int, set[str]] = {} - - self._extract_all_fonts() - - # ------------------------------------------------------------------ - def _extract_all_fonts(self) -> None: - """Walk every page and extract font buffers by xref.""" - seen_xrefs: set[int] = set() - for page_idx in range(len(self._doc)): - for font_entry in self._doc.get_page_fonts(page_idx, full=True): - xref = font_entry[0] - if xref in seen_xrefs: - continue - seen_xrefs.add(xref) - - name, ext, _ftype, content = self._doc.extract_font(xref) - if not content or not name: - continue - # Normalise name (some fonts carry subset prefixes like ABCDEF+) - clean = name.split("+")[-1] if "+" in name else name - if clean not in self._buffers: - self._buffers[clean] = content - logger.debug( - "FontCache: extracted '%s' (%d bytes)", clean, len(content) - ) - - # ------------------------------------------------------------------ - def resolve(self, style: dict[str, Any], page: pymupdf.Page) -> str: - """Return the best font name to use for *style* on *page*. - - If the original font can be recovered from the document it is - re-embedded into the page and its name is returned. Otherwise a - Base-14 fallback is returned. - """ - original_font = str(style.get("font") or "") - # Strip subset prefix (e.g. 
BCDEEE+ArialMT -> ArialMT) - clean = original_font.split("+")[-1] if "+" in original_font else original_font - - if clean and clean in self._buffers: - return self._ensure_on_page(clean, page) - - # Try a looser match (case-insensitive, ignoring commas, hyphens, spaces) - normalised = self._normalise_key(clean) - if normalised: - # Exact normalised match - for cached_name in self._buffers: - if self._normalise_key(cached_name) == normalised: - return self._ensure_on_page(cached_name, page) - - # Prefix / contains match (e.g. span says "LiberationSansNarrow" - # but cached name is "Liberation Sans Narrow Regular") - for cached_name in self._buffers: - cached_norm = self._normalise_key(cached_name) - if cached_norm.startswith(normalised) or normalised.startswith( - cached_norm - ): - return self._ensure_on_page(cached_name, page) - - # Fallback to Base-14 - return _base14_fontname_for_style(style) - - # ------------------------------------------------------------------ - def _ensure_on_page(self, font_name: str, page: pymupdf.Page) -> str: - """Register the font on *page* if not already done.""" - page_idx = page.number - if page_idx not in self._page_fonts: - self._page_fonts[page_idx] = set() - - # Derive a short insertion name from the font (must start with /) - insert_name = self._registered.get(font_name) - if insert_name is None: - # sanitise: keep only alnum - safe = re.sub(r"[^A-Za-z0-9]", "", font_name)[:20] or "CustomFont" - insert_name = f"F_{safe}" - self._registered[font_name] = insert_name - - if font_name not in self._page_fonts[page_idx]: - try: - page.insert_font( - fontname=insert_name, - fontbuffer=self._buffers[font_name], - ) - except Exception as exc: - logger.debug("FontCache: could not insert '%s': %s", font_name, exc) - return _base14_fontname_for_style({"font": font_name}) - self._page_fonts[page_idx].add(font_name) - - return insert_name - - # ------------------------------------------------------------------ - @staticmethod - def 
_normalise_key(name: str) -> str: - return re.sub(r"[\-,_ ]", "", name).lower() - - def _build_flexible_pattern(text: str) -> str: tokens = [re.escape(tok) for tok in re.split(r"\s+", text.strip()) if tok] return r"\s+".join(tokens) @@ -311,12 +257,30 @@ def _label_end(label: dict) -> int: def _label_surface_text(label: dict, document: str) -> str: attrs = label.get("attrs") or {} - alt_text = attrs.get("aymurai_alt_text") - if alt_text: - return str(alt_text) - start = _label_start(label) - end = _label_end(label) + # Prefer explicit alt text when the key is present + if "aymurai_alt_text" in attrs: + alt_text = attrs["aymurai_alt_text"] + return str(alt_text) if alt_text else "" + + # Use alt char offsets when available + alt_start = attrs.get("aymurai_alt_start_char") + alt_end = attrs.get("aymurai_alt_end_char") + + if alt_start is not None and alt_end is not None: + start, end = int(alt_start), int(alt_end) + if 0 <= start < end <= len(document): + return document[start:end] + # Alt range is empty/invalid — alt processing cleared this label + return "" + + # If alt keys exist but values are None, alt processing cleared this label + if "aymurai_alt_start_char" in attrs and alt_start is None: + return "" + + # No alt info available; use raw char offsets + start = int(label.get("start_char") or 0) + end = int(label.get("end_char") or 0) if 0 <= start < end <= len(document): return document[start:end] @@ -440,35 +404,23 @@ def _measure(text: str, size: float) -> float: return None, None -def _make_font_obj( - font_cache: _FontCache | None, style: dict[str, Any] -) -> pymupdf.Font | None: - """Try to build a ``pymupdf.Font`` from the cached buffer for accurate - text measurement. 
Returns ``None`` on failure.""" - if font_cache is None: - return None - original_font = str(style.get("font") or "") - clean = original_font.split("+")[-1] if "+" in original_font else original_font - buf = font_cache._buffers.get(clean) - if not buf: - # Try normalised / prefix lookup - norm = _FontCache._normalise_key(clean) - if norm: - for cached_name, cached_buf in font_cache._buffers.items(): - cached_norm = _FontCache._normalise_key(cached_name) - if ( - cached_norm == norm - or cached_norm.startswith(norm) - or norm.startswith(cached_norm) - ): - buf = cached_buf - break - if buf: - try: - return pymupdf.Font(fontbuffer=buf) - except Exception: - pass - return None +# Cache of Base-14 pymupdf.Font objects (they are reusable and thread-safe). +_BASE14_FONT_CACHE: dict[str, pymupdf.Font] = {} + + +def _get_base14_font(style: dict[str, Any]) -> pymupdf.Font: + """Return a ``pymupdf.Font`` built from the Base-14 name that matches + *style*. The object is cached so repeated calls are essentially free. 
+ + Base-14 fonts always contain the full Latin character set (including + ``<``, ``>``, ``_``, digits) and correctly carry bold / italic weight, + unlike subset font buffers extracted from the PDF.""" + name = _base14_fontname_for_style(style) + font = _BASE14_FONT_CACHE.get(name) + if font is None: + font = pymupdf.Font(name) + _BASE14_FONT_CACHE[name] = font + return font def _apply_minimal_boundary_merge( @@ -551,6 +503,7 @@ def _build_layout_paragraphs(parsed_doc: Any) -> list[dict]: line_cursor += len(text) line_end = line_cursor style = _line_style(line) + spans_detail, strip_offset = _build_spans_detail(line) line_entries.append( { @@ -563,6 +516,8 @@ def _build_layout_paragraphs(parsed_doc: Any) -> list[dict]: "end": line_end, "text": text, "style": style, + "spans_detail": spans_detail, + "strip_offset": strip_offset, } ) @@ -710,12 +665,10 @@ def _pick_rect_group_for_segment( def _padded_rect(rect: pymupdf.Rect, clip: pymupdf.Rect) -> pymupdf.Rect: padded = pymupdf.Rect(rect) - pad_x = min(PDF_TAG_RECT_X_PADDING, max(rect.height * 0.2, 0.5)) - pad_y = min(PDF_TAG_RECT_Y_PADDING, max(rect.height * 0.08, 0.25)) - padded.x0 = max(clip.x0, padded.x0 - pad_x) - padded.y0 = max(clip.y0, padded.y0 - pad_y) - padded.x1 = min(clip.x1, padded.x1 + pad_x) - padded.y1 = min(clip.y1, padded.y1 + pad_y) + padded.x0 = max(clip.x0, padded.x0 - PDF_TAG_RECT_X_PADDING) + padded.y0 = max(clip.y0, padded.y0 - PDF_TAG_RECT_Y_PADDING) + padded.x1 = min(clip.x1, padded.x1 + PDF_TAG_RECT_X_PADDING) + padded.y1 = min(clip.y1, padded.y1 + PDF_TAG_RECT_Y_PADDING) return padded @@ -729,27 +682,119 @@ def _render_rect(rect: pymupdf.Rect) -> pymupdf.Rect: return render_rect +def _text_redact_rect(rect: pymupdf.Rect) -> pymupdf.Rect: + redact_rect = pymupdf.Rect(rect) + edge_inset = min(0.25, max(redact_rect.width * 0.01, 0.05)) + if redact_rect.width > (2 * edge_inset): + redact_rect.x0 += edge_inset + redact_rect.x1 -= edge_inset + return redact_rect + + +def 
_normalize_line_chars(spans: list[dict]) -> list[dict[str, Any]]: + chars: list[dict[str, Any]] = [] + for span in spans: + for char in span.get("chars") or []: + norm_text = normalize("NFKC", str(char.get("c") or "")) + if not norm_text: + continue + bbox = pymupdf.Rect(char["bbox"]) + for norm_char in norm_text: + chars.append({"char": norm_char, "bbox": bbox}) + return chars + + +def _line_chars_from_page(page: pymupdf.Page, line: dict) -> list[dict[str, Any]]: + clip = pymupdf.Rect(line["bbox"]) + raw = page.get_text("rawdict", clip=clip) + target_text = normalize("NFKC", str(line.get("text") or "")).strip() + + best_chars: list[dict[str, Any]] = [] + best_score: tuple[float, float, float] | None = None + + for block in raw.get("blocks") or []: + if block.get("type", 0) != 0: + continue + for raw_line in block.get("lines") or []: + chars = _normalize_line_chars(raw_line.get("spans") or []) + if not chars: + continue + + candidate_rect = pymupdf.Rect(raw_line["bbox"]) + candidate_text = "".join(entry["char"] for entry in chars).strip() + overlap = ( + _rect_vertical_overlap(candidate_rect, clip) + if candidate_rect.intersects(clip) + else 0.0 + ) + text_score = 0.0 + if target_text or candidate_text: + text_score = ( + 0.0 + if target_text == candidate_text + else cer(target_text, candidate_text) + ) + bbox_score = ( + abs(candidate_rect.x0 - clip.x0) + + abs(candidate_rect.y0 - clip.y0) + + abs(candidate_rect.x1 - clip.x1) + + abs(candidate_rect.y1 - clip.y1) + ) / 100.0 + score = (1.0 - overlap, text_score, bbox_score) + if best_score is None or score < best_score: + best_score = score + best_chars = chars + + return best_chars + + +def _rect_from_char_slice( + chars: list[dict[str, Any]], + start: int, + end: int, +) -> pymupdf.Rect | None: + if not chars: + return None + + slice_start = max(int(start), 0) + slice_end = min(int(end), len(chars)) + if slice_end <= slice_start: + return None + + segment = chars[slice_start:slice_end] + if not segment: + return 
None + + boxes = [entry["bbox"] for entry in segment if str(entry["char"]).strip()] + if not boxes: + boxes = [entry["bbox"] for entry in segment] + if not boxes: + return None + + rect = pymupdf.Rect(boxes[0]) + for bbox in boxes[1:]: + rect.include_rect(bbox) + return rect + + def _build_page_op( rect: pymupdf.Rect, line: dict | None, token: str, - page: pymupdf.Page | None = None, - font_cache: _FontCache | None = None, is_image: bool = False, + entity_style: dict[str, Any] | None = None, ) -> dict[str, Any]: line_clip = pymupdf.Rect(line["bbox"]) if line else pymupdf.Rect(rect) canvas_rect = _padded_rect(rect, line_clip) render_rect = _render_rect(canvas_rect) - style = (line or {}).get("style") or _default_style() + style = entity_style or (line or {}).get("style") or _default_style() base_font_size = float((line or {}).get("font_size") or style.get("size") or 10.0) - # Resolve font: prefer original font from cache, fallback to Base-14 - if font_cache is not None and page is not None: - fontname = font_cache.resolve(style, page) - else: - fontname = _base14_fontname_for_style(style) - - font_obj = _make_font_obj(font_cache, style) + # Always use Base-14 fonts: they carry correct bold/italic weight and + # contain all glyphs needed for tags (<, >, _, digits, letters). + # Subset font buffers extracted from the PDF lack many of these glyphs. 
+ fontname = _base14_fontname_for_style(style) + font_obj = _get_base14_font(style) display_token, fitted_size = _fit_display_token( token, @@ -767,15 +812,18 @@ def _build_page_op( ) return { - "redact_rect": canvas_rect, + "redact_rect": _text_redact_rect(rect), "canvas_rect": canvas_rect, "render_rect": render_rect, + "line_rect": line_clip, "text": display_token, "logical_token": token, "fontname": fontname, "fontsize": fitted_size, + "text_align": pymupdf.TEXT_ALIGN_LEFT, "text_color": style.get("color") or (0.0, 0.0, 0.0), "is_image": is_image, + "skip_background_fill": is_image, "style": style, } @@ -809,17 +857,318 @@ def _entity_overlaps_image( return None +def _widget_text_color(widget: pymupdf.Widget) -> tuple[float, float, float]: + values = list(widget.text_color or []) + if not values: + return (0.0, 0.0, 0.0) + if len(values) == 1: + shade = float(values[0]) + return (shade, shade, shade) + if len(values) >= 3: + return tuple(float(value) for value in values[:3]) + return (0.0, 0.0, 0.0) + + +def _style_from_widget(widget: pymupdf.Widget) -> dict[str, Any]: + return { + "font": str(widget.text_font or ""), + "flags": 0, + "color": _widget_text_color(widget), + "size": float(widget.text_fontsize or 10.0), + "ascender": 0.8, + "descender": -0.2, + } + + +def _page_widget_infos(page: pymupdf.Page) -> list[dict[str, Any]]: + infos: list[dict[str, Any]] = [] + for widget in page.widgets() or []: + if widget.field_type not in ( + pymupdf.PDF_WIDGET_TYPE_TEXT, + pymupdf.PDF_WIDGET_TYPE_SIGNATURE, + ): + continue + infos.append( + { + "xref": int(widget.xref), + "field_type": int(widget.field_type), + "field_name": str(widget.field_name or ""), + "field_value": str(widget.field_value or ""), + "rect": pymupdf.Rect(widget.rect), + "style": _style_from_widget(widget), + } + ) + return infos + + +def _entity_overlaps_widget( + entity_rect: pymupdf.Rect, + widget_infos: list[dict[str, Any]], +) -> dict[str, Any] | None: + best_widget: dict[str, Any] | None = 
None + best_area = 0.0 + for widget_info in widget_infos: + widget_rect = widget_info["rect"] + if not entity_rect.intersects(widget_rect): + continue + area = (entity_rect & widget_rect).get_area() + if area > best_area: + best_area = area + best_widget = widget_info + return best_widget + + +def _fit_widget_token( + widget_info: dict[str, Any], + current_text: str, + entity_span: tuple[int, int], + token: str, +) -> str: + style = widget_info.get("style") or _default_style() + rect = pymupdf.Rect(widget_info["rect"]) + font_obj = _get_base14_font(style) + max_width = max(rect.width - 1.0, 1.0) + + prefix = current_text[: entity_span[0]] + suffix = current_text[entity_span[1] :] + + for candidate in _build_display_token_candidates(token): + candidate_text = f"{prefix}{candidate}{suffix}" + if ( + font_obj.text_length( + candidate_text, fontsize=float(style.get("size") or 10.0) + ) + <= max_width + 0.1 + ): + return candidate + + candidates = _build_display_token_candidates(token) + return candidates[0] if candidates else f"<{token}>" + + +def _apply_widget_ops( + doc: pymupdf.Document, + widget_ops: dict[int, list[dict]], +) -> None: + for page_idx, ops in widget_ops.items(): + if not ops: + continue + + page = doc[page_idx] + widgets = { + int(widget.xref): widget + for widget in (page.widgets() or []) + if widget.field_type == pymupdf.PDF_WIDGET_TYPE_TEXT + } + grouped: dict[int, list[dict]] = {} + for op in ops: + grouped.setdefault(int(op["widget_xref"]), []).append(op) + + for widget_xref, replacements in grouped.items(): + widget = widgets.get(widget_xref) + if widget is None: + logger.warning( + "Could not resolve PDF widget xref=%s on page=%s", + widget_xref, + page_idx, + ) + continue + + current_text = str(widget.field_value or "") + if not current_text: + continue + + search_cursor = 0 + changed = False + for replacement in replacements: + entity_text = replacement["entity_text"] + span = _find_flexible(current_text, entity_text, start=search_cursor) + 
if span is None: + span = _find_flexible(current_text, entity_text, start=0) + if span is None: + logger.warning( + "Could not map widget label '%s' in widget '%s' on page=%s", + entity_text, + replacement.get("field_name") or widget.field_name, + page_idx, + ) + continue + + token_text = _fit_widget_token( + replacement["widget_info"], + current_text, + span, + replacement["logical_token"], + ) + current_text = ( + f"{current_text[: span[0]]}{token_text}{current_text[span[1] :]}" + ) + search_cursor = span[0] + len(token_text) + changed = True + + if not changed: + continue + + try: + widget.field_value = current_text + widget.update() + except Exception as exc: + logger.warning( + "Failed to update PDF widget '%s' on page=%s: %s", + widget.field_name, + page_idx, + exc, + ) + + +def _apply_signature_widget_ops( + doc: pymupdf.Document, + signature_widget_ops: dict[int, list[dict]], +) -> None: + for page_idx, ops in signature_widget_ops.items(): + if not ops: + continue + + page = doc[page_idx] + widgets = { + int(widget.xref): widget + for widget in (page.widgets() or []) + if widget.field_type == pymupdf.PDF_WIDGET_TYPE_SIGNATURE + } + grouped: dict[int, list[dict]] = {} + for op in ops: + grouped.setdefault(int(op["widget_xref"]), []).append(op) + + for widget_xref, widget_group_ops in grouped.items(): + widget_rect = pymupdf.Rect(widget_group_ops[0]["widget_rect"]) + + try: + pix = page.get_pixmap( + clip=widget_rect, + matrix=pymupdf.Matrix( + _IMAGE_EDIT_DPI / 72.0, _IMAGE_EDIT_DPI / 72.0 + ), + alpha=False, + ) + except Exception as exc: + logger.warning( + "Could not rasterise signature widget xref=%s on page=%s: %s", + widget_xref, + page_idx, + exc, + ) + pix = None + + widget = widgets.get(widget_xref) + if widget is not None: + try: + page.delete_widget(widget) + except Exception as exc: + logger.warning( + "Failed to delete signature widget xref=%s on page=%s: %s", + widget_xref, + page_idx, + exc, + ) + + if pix is None: + page.draw_rect( + 
widget_rect, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) + else: + img = ( + np.frombuffer(pix.samples, dtype=np.uint8) + .reshape(pix.height, pix.width, pix.n) + .copy() + ) + if pix.n >= 3: + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + + scale = _IMAGE_EDIT_DPI / 72.0 + mask = np.zeros(img.shape[:2], dtype=np.uint8) + for op in widget_group_ops: + canvas = op["canvas_rect"] + x0 = max(int((canvas.x0 - widget_rect.x0) * scale), 0) + y0 = max(int((canvas.y0 - widget_rect.y0) * scale), 0) + x1 = min(int((canvas.x1 - widget_rect.x0) * scale), img.shape[1]) + y1 = min(int((canvas.y1 - widget_rect.y0) * scale), img.shape[0]) + if x1 <= x0 or y1 <= y0: + continue + mask[y0:y1, x0:x1] = 255 + + if np.any(mask): + if _IMAGE_EDIT_MASK_DILATE > 0: + kernel = np.ones((3, 3), dtype=np.uint8) + mask = cv2.dilate( + mask, kernel, iterations=_IMAGE_EDIT_MASK_DILATE + ) + try: + img = cv2.inpaint( + img, + mask, + _IMAGE_EDIT_INPAINT_RADIUS, + cv2.INPAINT_TELEA, + ) + except Exception as exc: + logger.warning( + "OpenCV inpaint failed for signature widget xref=%s on page=%s: %s", + widget_xref, + page_idx, + exc, + ) + img[mask > 0] = 255 + + success, png_buf = cv2.imencode(".png", img) + if success: + try: + page.insert_image( + widget_rect, stream=png_buf.tobytes(), overlay=True + ) + except Exception as exc: + logger.warning( + "Failed to insert edited signature widget image xref=%s on page=%s: %s", + widget_xref, + page_idx, + exc, + ) + page.draw_rect( + widget_rect, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) + else: + page.draw_rect( + widget_rect, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) + + for op in widget_group_ops: + _render_text_op(page, op) + + def _collect_page_redactions( doc: pymupdf.Document, paragraphs: list[dict], render_context: dict[str, Any] | None, - font_cache: _FontCache | None = None, ) -> dict[int, list[dict]]: page_ops: dict[int, list[dict]] = {} + widget_ops: dict[int, 
list[dict]] = {} + signature_widget_ops: dict[int, list[dict]] = {} line_x_cursor: dict[tuple[int, int, int], float] = {} + line_char_cache: dict[tuple[int, int, int], list[dict[str, Any]]] = {} - # Pre-compute image rects per page + # Pre-compute image rects and widgets per page page_image_rects: dict[int, list[pymupdf.Rect]] = {} + page_widgets: dict[int, list[dict[str, Any]]] = {} for paragraph in paragraphs: metadata = paragraph.get("metadata") or {} @@ -835,14 +1184,28 @@ def _collect_page_redactions( labels = sorted(paragraph.get("labels") or [], key=_label_start) search_cursor = 0 - # Lazy-load image rects for this page + # Lazy-load image rects and widget infos for this page if page_index not in page_image_rects: page_image_rects[page_index] = _image_rects_for_clip(page, page.rect) + if page_index not in page_widgets: + page_widgets[page_index] = _page_widget_infos(page) for label in labels: entity_text = _label_surface_text(label, document).strip() if not entity_text: - entity_text = str(label.get("text") or "").strip() + # Fall back to raw label text only if alt processing was + # not applied (no alt attributes present at all). 
+ attrs = label.get("attrs") or {} + alt_applied = any( + key in attrs + for key in ( + "aymurai_alt_text", + "aymurai_alt_start_char", + "aymurai_alt_end_char", + ) + ) + if not alt_applied: + entity_text = str(label.get("text") or "").strip() if not entity_text: continue @@ -859,6 +1222,43 @@ def _collect_page_redactions( if rect.intersects(box_clip) ] + # Check if this is a widget-backed entity before falling back to images + if fallback_rects: + fallback_widget = _entity_overlaps_widget( + fallback_rects[0], + page_widgets[page_index], + ) + if fallback_widget is not None: + if ( + fallback_widget["field_type"] + == pymupdf.PDF_WIDGET_TYPE_TEXT + ): + widget_ops.setdefault(page_index, []).append( + { + "widget_xref": fallback_widget["xref"], + "field_name": fallback_widget["field_name"], + "widget_info": fallback_widget, + "entity_text": entity_text, + "logical_token": token, + } + ) + continue + if ( + fallback_widget["field_type"] + == pymupdf.PDF_WIDGET_TYPE_SIGNATURE + ): + op = _build_page_op( + fallback_rects[0], + lines[0] if lines else None, + token, + entity_style=fallback_widget.get("style") or None, + ) + op["skip_background_fill"] = True + op["widget_xref"] = fallback_widget["xref"] + op["widget_rect"] = fallback_widget["rect"] + signature_widget_ops.setdefault(page_index, []).append(op) + continue + # Check if this is an image-based entity if not fallback_rects: img_match = _try_image_entity( @@ -872,8 +1272,6 @@ def _collect_page_redactions( img_match, lines[0] if lines else None, token, - page=page, - font_cache=font_cache, is_image=True, ) op["image_rect"] = img_match @@ -897,8 +1295,6 @@ def _collect_page_redactions( rect, fallback_line, token, - page=page, - font_cache=font_cache, is_image=(img_rect is not None), ) if img_rect is not None: @@ -917,7 +1313,16 @@ def _collect_page_redactions( search_cursor = span[1] # Collect line segments this entity spans - segments: list[tuple[dict, str, pymupdf.Rect]] = [] + segments: list[ + tuple[ + dict, 
+ str, + pymupdf.Rect, + pymupdf.Rect | None, + dict, + dict[str, Any] | None, + ] + ] = [] for line in lines: overlap_start = max(span[0], line["start"]) overlap_end = min(span[1], line["end"]) @@ -928,8 +1333,32 @@ def _collect_page_redactions( if not segment_text: continue - rect = _pick_rect_group_for_segment( - page, line, segment_text, line_x_cursor + line_key = ( + line["page_index"], + line["box_index"], + line["line_index"], + ) + line_chars = line_char_cache.get(line_key) + if line_chars is None: + line_chars = _line_chars_from_page(page, line) + line_char_cache[line_key] = line_chars + + raw_start = ( + overlap_start - line["start"] + int(line.get("strip_offset", 0)) + ) + raw_end = overlap_end - line["start"] + int(line.get("strip_offset", 0)) + rect = _rect_from_char_slice(line_chars, raw_start, raw_end) + if rect is None: + rect = _pick_rect_group_for_segment( + page, + line, + segment_text, + line_x_cursor, + ) + + widget_info = _entity_overlaps_widget( + rect, + page_widgets[page_index], ) # Check for image overlap @@ -938,67 +1367,116 @@ def _collect_page_redactions( rect, page_image_rects[page_index], ) - segments.append((line, segment_text, rect, img_rect)) + + # Determine entity-specific style from the span that + # actually contains this text (not the line's dominant style) + offset_in_line = overlap_start - line["start"] + ent_style = _entity_style_from_spans(line, offset_in_line) + + segments.append( + (line, segment_text, rect, img_rect, ent_style, widget_info) + ) if not segments: continue if len(segments) == 1: - # Single-line entity: write the full token - line, _seg_text, rect, img_rect = segments[0] + # Single-line entity: route widget-backed content through the widget path. 
+ line, _seg_text, rect, img_rect, ent_style, widget_info = segments[0] + if widget_info is not None: + if widget_info["field_type"] == pymupdf.PDF_WIDGET_TYPE_TEXT: + widget_ops.setdefault(page_index, []).append( + { + "widget_xref": widget_info["xref"], + "field_name": widget_info["field_name"], + "widget_info": widget_info, + "entity_text": entity_text, + "logical_token": token, + } + ) + continue + if widget_info["field_type"] == pymupdf.PDF_WIDGET_TYPE_SIGNATURE: + op = _build_page_op( + rect, + line, + token, + entity_style=ent_style, + ) + op["skip_background_fill"] = True + op["widget_xref"] = widget_info["xref"] + op["widget_rect"] = widget_info["rect"] + signature_widget_ops.setdefault(page_index, []).append(op) + continue + op = _build_page_op( rect, line, token, - page=page, - font_cache=font_cache, is_image=(img_rect is not None), + entity_style=ent_style, ) if img_rect is not None: op["image_rect"] = img_rect page_ops.setdefault(page_index, []).append(op) else: - # Multi-line entity: write the token centered on the - # WIDEST segment only; blank the other segments. + # Multi-line entity: write the token on the widest segment only; blank the others. 
widest_idx = max( range(len(segments)), key=lambda i: segments[i][2].width, ) any_image = any(seg[3] is not None for seg in segments) - for seg_idx, (seg_line, _seg_text, seg_rect, seg_img) in enumerate( - segments - ): + signature_widget = None + if all(seg[5] is not None for seg in segments): + widget_xrefs = {int(seg[5]["xref"]) for seg in segments} + widget_types = {int(seg[5]["field_type"]) for seg in segments} + if len(widget_xrefs) == 1 and widget_types == { + pymupdf.PDF_WIDGET_TYPE_SIGNATURE + }: + signature_widget = segments[0][5] + + for seg_idx, ( + seg_line, + _seg_text, + seg_rect, + seg_img, + seg_style, + seg_widget, + ) in enumerate(segments): if seg_idx == widest_idx: - # Primary segment: render the token here op = _build_page_op( seg_rect, seg_line, token, - page=page, - font_cache=font_cache, - is_image=any_image, + is_image=(any_image and signature_widget is None), + entity_style=seg_style, ) - if seg_img is not None: + if seg_img is not None and signature_widget is None: op["image_rect"] = seg_img else: - # Secondary segment: just blank it (no text) op = _build_page_op( seg_rect, seg_line, token, - page=page, - font_cache=font_cache, - is_image=(seg_img is not None), + is_image=( + (seg_img is not None) and signature_widget is None + ), + entity_style=seg_style, ) - op["text"] = None # suppress text rendering + op["text"] = None op["fontsize"] = None - if seg_img is not None: + if seg_img is not None and signature_widget is None: op["image_rect"] = seg_img - page_ops.setdefault(page_index, []).append(op) + if signature_widget is not None: + op["skip_background_fill"] = True + op["widget_xref"] = signature_widget["xref"] + op["widget_rect"] = signature_widget["rect"] + signature_widget_ops.setdefault(page_index, []).append(op) + else: + page_ops.setdefault(page_index, []).append(op) - return page_ops + return page_ops, widget_ops, signature_widget_ops def _try_image_entity( @@ -1046,23 +1524,46 @@ def _try_image_entity( def _apply_redactions( 
doc: pymupdf.Document, page_ops: dict[int, list[dict]], - font_cache: _FontCache | None = None, + widget_ops: dict[int, list[dict]], + signature_widget_ops: dict[int, list[dict]], ) -> None: + _apply_widget_ops(doc, widget_ops) + _apply_signature_widget_ops(doc, signature_widget_ops) + for page_idx, ops in page_ops.items(): page = doc[page_idx] - # 1) Add text redaction annotations (non-image ops only). - # Image entities are handled separately with white-rect overlay - # to avoid PDF_REDACT_IMAGE_REMOVE which destroys ALL images on - # the page. + # Separate image ops from text ops + text_ops: list[dict] = [] + image_ops: list[dict] = [] for op in ops: - if not op.get("is_image"): - page.add_redact_annot( - op["redact_rect"], - text=None, - fill=(1, 1, 1), - cross_out=False, - ) + if op.get("is_image") and op.get("image_rect") is not None: + image_ops.append(op) + else: + text_ops.append(op) + + # ── Image entities: edit via OpenCV ────────────────────────── + # Group image ops by their image_rect so we render/edit each + # image only once even when multiple entities overlap it. + if image_ops: + img_groups: dict[tuple, list[dict]] = {} + for op in image_ops: + key = _rect_tuple(op["image_rect"]) + img_groups.setdefault(key, []).append(op) + + for rect_key, group_ops in img_groups.items(): + img_rect = pymupdf.Rect(rect_key) + _edit_image_with_opencv(page, img_rect, group_ops) + + # ── Text entities: standard redact flow ────────────────────── + # 1) Add text redaction annotations + for op in text_ops: + page.add_redact_annot( + op["redact_rect"], + text=None, + fill=(1, 1, 1), + cross_out=False, + ) # 2) Apply text redactions (images are never touched here) page.apply_redactions( @@ -1071,123 +1572,178 @@ def _apply_redactions( text=pymupdf.PDF_REDACT_TEXT_REMOVE, ) - # 3) Draw white canvas + centered replacement text - for op in ops: - is_image = op.get("is_image", False) + # 3) Draw replacement text after the redactions and image edits are in place. 
+ for op in text_ops: + _render_text_op(page, op) + for op in image_ops: + _render_text_op(page, op) - if is_image: - # For image entities, paint a white rect that covers the - # FULL image bounding box (not just the entity text rect) - # so the original content is completely hidden. - img_rect = op.get("image_rect") - if img_rect is not None: - page.draw_rect( - img_rect, - color=(1, 1, 1), - fill=(1, 1, 1), - width=0, - overlay=True, - ) - # Always white-out the canvas area (text or image) - canvas = op["canvas_rect"] - page.draw_rect( - canvas, - color=(1, 1, 1), - fill=(1, 1, 1), - width=0, - overlay=True, - ) +def _edit_image_with_opencv( + page: pymupdf.Page, + img_rect: pymupdf.Rect, + ops: list[dict], +) -> None: + """Rasterise *img_rect* from *page*, remove the original entity pixels, + and overlay the edited image back onto the page. - if not op.get("text") or not op.get("fontsize"): - continue + Tags are rendered afterwards with the normal PDF text path so they stay + sharp and aligned with the surrounding text instead of being rasterised by + OpenCV. 
+ """ + scale = _IMAGE_EDIT_DPI / 72.0 + mat = pymupdf.Matrix(scale, scale) - render = op["render_rect"] - style = op.get("style") or {} + try: + pix = page.get_pixmap(clip=img_rect, matrix=mat, alpha=False) + except Exception as exc: + logger.warning("Could not rasterise image region %s: %s", img_rect, exc) + page.draw_rect( + img_rect, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) + return + + img = ( + np.frombuffer(pix.samples, dtype=np.uint8) + .reshape( + pix.height, + pix.width, + pix.n, + ) + .copy() + ) + if pix.n >= 3: + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + + mask = np.zeros(img.shape[:2], dtype=np.uint8) + for op in ops: + canvas = op["canvas_rect"] + x0 = max(int((canvas.x0 - img_rect.x0) * scale), 0) + y0 = max(int((canvas.y0 - img_rect.y0) * scale), 0) + x1 = min(int((canvas.x1 - img_rect.x0) * scale), img.shape[1]) + y1 = min(int((canvas.y1 - img_rect.y0) * scale), img.shape[0]) + + if x1 <= x0 or y1 <= y0: + continue - # --- Text insertion strategy --- - # ``page.insert_textbox`` / ``insert_text`` do NOT support fonts - # registered via ``page.insert_font`` — they only understand - # Base-14 names or ``fontfile`` paths. We therefore use - # ``TextWriter.fill_textbox`` which accepts a ``pymupdf.Font`` - # object built directly from the cached buffer, giving us both - # correct typeface and native center alignment. 
+ mask[y0:y1, x0:x1] = 255 - written = False + if np.any(mask): + if _IMAGE_EDIT_MASK_DILATE > 0: + kernel = np.ones((3, 3), dtype=np.uint8) + mask = cv2.dilate(mask, kernel, iterations=_IMAGE_EDIT_MASK_DILATE) + try: + img = cv2.inpaint(img, mask, _IMAGE_EDIT_INPAINT_RADIUS, cv2.INPAINT_TELEA) + except Exception as exc: + logger.warning("OpenCV inpaint failed for rect %s: %s", img_rect, exc) + img[mask > 0] = 255 + + success, png_buf = cv2.imencode(".png", img) + if not success: + logger.warning("Failed to encode edited image for rect %s", img_rect) + page.draw_rect( + img_rect, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) + return - # Attempt 1: TextWriter with original font buffer - if font_cache is not None and not written: - font_obj = _make_font_obj(font_cache, style) - if font_obj is not None: - try: - tw = pymupdf.TextWriter(page.rect, color=op["text_color"]) - tw.fill_textbox( - render, - op["text"], - font=font_obj, - fontsize=op["fontsize"], - align=pymupdf.TEXT_ALIGN_CENTER, - ) - tw.write_text(page, overlay=True) - written = True - except Exception as exc: - logger.debug( - "TextWriter failed for '%s': %s", - op["text"], - exc, - ) + try: + page.insert_image(img_rect, stream=png_buf.tobytes(), overlay=True) + except Exception as exc: + logger.warning( + "Failed to re-insert edited image for rect %s: %s", img_rect, exc + ) + page.draw_rect( + img_rect, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) - # Attempt 2: insert_textbox with Base-14 fallback font - if not written: - base14 = _base14_fontname_for_style(style) - try: - page.insert_textbox( - render, - op["text"], - fontname=base14, - fontsize=op["fontsize"], - color=op["text_color"], - align=pymupdf.TEXT_ALIGN_CENTER, - overlay=True, - ) - written = True - except Exception as exc: - logger.debug( - "insert_textbox (Base-14) failed for '%s': %s", - op["text"], - exc, - ) - # Attempt 3: insert_text centered with Base-14 - if not written: - base14 = 
_base14_fontname_for_style(style) - try: - descender = 0.2 - baseline_y = render.y1 - (descender * op["fontsize"]) - baseline_y = min( - max(baseline_y, render.y0 + 1.0), - render.y1 - 0.25, - ) - text_w = pymupdf.get_text_length( - op["text"], - fontname=base14, - fontsize=op["fontsize"], - ) - x_start = render.x0 + max((render.width - text_w) / 2.0, 0.0) - page.insert_text( - (x_start, baseline_y), - op["text"], - fontname=base14, - fontsize=op["fontsize"], - color=op["text_color"], - overlay=True, - ) - except Exception as exc: - logger.warning( - "All text insertion methods failed for '%s': %s", - op["text"], - exc, - ) +def _render_text_op(page: pymupdf.Page, op: dict) -> None: + """Render a single anonymisation tag onto *page*.""" + canvas = op["canvas_rect"] + if not op.get("skip_background_fill"): + page.draw_rect( + canvas, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) + + if not op.get("text") or not op.get("fontsize"): + return + + render = op["render_rect"] + line_rect = pymupdf.Rect(op.get("line_rect") or render) + style = op.get("style") or {} + base14_name = _base14_fontname_for_style(style) + font_obj = _get_base14_font(style) + + fontsize = float(op["fontsize"]) + descender = float(style.get("descender") or -0.2) + baseline_y = line_rect.y1 + (descender * fontsize) + baseline_y = min( + max(baseline_y, line_rect.y0 + (fontsize * 0.65)), + line_rect.y1 - 0.1, + ) + + text_width = font_obj.text_length(op["text"], fontsize=fontsize) + x_start = render.x0 + max((render.width - text_width) / 2.0, 0.0) + + try: + page.insert_text( + (x_start, baseline_y), + op["text"], + fontname=base14_name, + fontsize=fontsize, + color=op["text_color"], + overlay=True, + ) + return + except Exception as exc: + logger.debug("insert_text failed for '%s': %s", op["text"], exc) + + try: + tw = pymupdf.TextWriter(page.rect, color=op["text_color"]) + tw.fill_textbox( + render, + op["text"], + font=font_obj, + fontsize=fontsize, + 
align=op.get("text_align", pymupdf.TEXT_ALIGN_CENTER), + ) + tw.write_text(page, overlay=True) + return + except Exception as exc: + logger.debug("TextWriter failed for '%s': %s", op["text"], exc) + + try: + page.insert_textbox( + render, + op["text"], + fontname=base14_name, + fontsize=fontsize, + color=op["text_color"], + align=op.get("text_align", pymupdf.TEXT_ALIGN_CENTER), + overlay=True, + ) + except Exception as exc: + logger.warning( + "All text insertion methods failed for '%s': %s", + op["text"], + exc, + ) def _add_footer_watermark(doc: pymupdf.Document) -> None: @@ -1235,20 +1791,16 @@ def anonymize( force_ocr=False, ) - # Build font cache to preserve original typefaces - font_cache = _FontCache(doc) - layout_paragraphs = _build_layout_paragraphs(parsed_doc) matched_paragraphs = _match_predictions_to_layout(layout_paragraphs, preds) _apply_minimal_boundary_merge(matched_paragraphs, render_context) - page_ops = _collect_page_redactions( + page_ops, widget_ops, signature_widget_ops = _collect_page_redactions( doc, matched_paragraphs, render_context, - font_cache=font_cache, ) - _apply_redactions(doc, page_ops, font_cache=font_cache) + _apply_redactions(doc, page_ops, widget_ops, signature_widget_ops) _add_footer_watermark(doc) os.makedirs(output_dir, exist_ok=True) diff --git a/pyproject.toml b/pyproject.toml index b5bb6d6..c572162 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,7 @@ dependencies = [ "sentence-transformers>=2.2.0", "pymupdf>=1.25.2", "pymupdf4llm>=0.0.17", + "opencv-python-headless>=4.5.0", "pypandoc>=1.15", "python-docx>=1.2.0", "docx2txt>=0.9", From cbcc235bbcf59187eb78b6c5115766fb62ba9665 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 17:13:14 +0000 Subject: [PATCH 11/28] =?UTF-8?q?=F0=9F=94=A7=20Update=20model=5Fdump=20ca?= =?UTF-8?q?lls=20to=20exclude=20None=20values=20for=20improved=20data=20ha?= =?UTF-8?q?ndling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
--- aymurai/api/endpoints/routers/anonymizer/anonymizer.py | 2 +- aymurai/database/crud/anonymization/paragraph.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py index ca9da63..6a48c33 100644 --- a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py +++ b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py @@ -594,7 +594,7 @@ async def anonymizer_compile_document( ) preds = [ - document_information.model_dump() + document_information.model_dump(mode="json", exclude_none=True) for document_information in filtered_annotations ] diff --git a/aymurai/database/crud/anonymization/paragraph.py b/aymurai/database/crud/anonymization/paragraph.py index 1d16903..17f826b 100644 --- a/aymurai/database/crud/anonymization/paragraph.py +++ b/aymurai/database/crud/anonymization/paragraph.py @@ -27,7 +27,7 @@ def _serialize_doclabels(value: list[DocLabel] | None): """ if value is None: return None - return _DOC_LABELS_ADAPTER.dump_python(value, mode="json") + return _DOC_LABELS_ADAPTER.dump_python(value, mode="json", exclude_none=True) def _normalize_paragraph_payload(payload: dict) -> dict: @@ -63,7 +63,7 @@ def anonymization_paragraph_create( Returns: AnonymizationParagraph: The persisted paragraph record. 
""" - payload = _normalize_paragraph_payload(paragraph_in.model_dump()) + payload = _normalize_paragraph_payload(paragraph_in.model_dump(exclude_none=True)) new_paragraph = AnonymizationParagraph(**payload) if override: @@ -171,14 +171,14 @@ def anonymization_paragraph_batch_create_update( paragraph = session.get(AnonymizationParagraph, paragraph_id) if paragraph: - payload = _normalize_paragraph_payload(p_in.model_dump()) + payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True)) payload.pop("id", None) for field, value in payload.items(): if value is not None: setattr(paragraph, field, value) else: - payload = _normalize_paragraph_payload(p_in.model_dump()) + payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True)) paragraph = AnonymizationParagraph(**payload) session.add(paragraph) From b452034c047b3b7b582890b4058abf856b5282cf Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 17:15:22 +0000 Subject: [PATCH 12/28] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20la?= =?UTF-8?q?bel=20replacement=20functions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/anonymization/alignment.py | 35 +++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/aymurai/text/anonymization/alignment.py b/aymurai/text/anonymization/alignment.py index 21287c9..5ca920a 100644 --- a/aymurai/text/anonymization/alignment.py +++ b/aymurai/text/anonymization/alignment.py @@ -62,6 +62,15 @@ def resolve_render_token(label: dict, render_context: dict | None = None) -> str def _label_replacement_start(label: dict) -> int: + """ + Determines the start character index for a label, considering possible alternative attributes. + + Args: + label (dict): Label dictionary which may contain alternative start character attributes. + + Returns: + int: The start character index for the label. 
+ """ attrs = label.get("attrs") or {} alt_start = attrs.get("aymurai_alt_start_char") start_char = label.get("start_char") @@ -69,6 +78,15 @@ def _label_replacement_start(label: dict) -> int: def _label_replacement_end(label: dict) -> int: + """ + Determines the end character index for a label, considering possible alternative attributes. + + Args: + label (dict): Label dictionary which may contain alternative end character attributes. + + Returns: + int: The end character index for the label. + """ attrs = label.get("attrs") or {} alt_end = attrs.get("aymurai_alt_end_char") end_char = label.get("end_char") @@ -76,10 +94,20 @@ def _label_replacement_end(label: dict) -> int: def _label_replacement_text(label: dict, document: str) -> str: + """ + Determines the replacement text for a label, considering possible alternative attributes. + + Args: + label (dict): Label dictionary which may contain alternative text attributes. + document (str): The document text from which to extract the label text. + + Returns: + str: The text for the label, considering possible alternative attributes. 
+ """ attrs = label.get("attrs") or {} - if "aymurai_alt_text" in attrs: - alt_text = attrs["aymurai_alt_text"] + alt_text = attrs.get("aymurai_alt_text") + if alt_text is not None: return str(alt_text) if alt_text else "" alt_start = attrs.get("aymurai_alt_start_char") @@ -90,9 +118,6 @@ def _label_replacement_text(label: dict, document: str) -> str: return document[start_char:end_char] return "" - if "aymurai_alt_start_char" in attrs and alt_start is None: - return "" - start_char = int(label.get("start_char") or 0) end_char = int(label.get("end_char") or 0) if 0 <= start_char < end_char <= len(document): From f3f9f34cfe26f3c5e319a8a424be685a8ae73147 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 17:16:09 +0000 Subject: [PATCH 13/28] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor=20watermark?= =?UTF-8?q?=20handling=20and=20optimize=20PDF=20token=20aliasing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/anonymization/pdf.py | 630 +++++++++++++++++------------- 1 file changed, 369 insertions(+), 261 deletions(-) diff --git a/aymurai/text/anonymization/pdf.py b/aymurai/text/anonymization/pdf.py index 23840cf..50813c4 100644 --- a/aymurai/text/anonymization/pdf.py +++ b/aymurai/text/anonymization/pdf.py @@ -3,12 +3,11 @@ import os import re from copy import deepcopy +from functools import lru_cache from pathlib import Path from typing import Any from unicodedata import normalize -import cv2 -import numpy as np import pymupdf import pymupdf.layout # noqa: F401 # activates layout support from jiwer import cer @@ -24,7 +23,16 @@ logger = get_logger(__name__) -WATERMARK_TEXT = "Documento anonimizado por AymurAI | https://www.aymurai.info/" +WATERMARK_PREFIX_TEXT = "Documento anonimizado por " +WATERMARK_LINK_TEXT = "AymurAI" +WATERMARK_TEXT = f"{WATERMARK_PREFIX_TEXT}{WATERMARK_LINK_TEXT}" +WATERMARK_URL = "https://www.aymurai.info/" +WATERMARK_FONT_FAMILY = "Archivo" +WATERMARK_FONT_SIZE = 10.0 
+WATERMARK_MARGIN_X = 24.0 +WATERMARK_BASELINE_MARGIN = 12.0 +WATERMARK_TEXT_COLOR = tuple(channel / 255 for channel in (192, 192, 192)) +WATERMARK_LINK_COLOR = tuple(channel / 255 for channel in (115, 190, 250)) TEXT_FLAG_ITALIC = 2 TEXT_FLAG_SERIF = 4 @@ -33,6 +41,23 @@ PDF_TAG_MIN_FONT_SIZE = 7.0 PDF_TAG_FONT_STEP = 0.5 PDF_TAG_MAX_ABBREVIATION = 3 +PDF_TOKEN_ALIAS_MAP: dict[str, tuple[str, str]] = { + "CORREO_ELECTRONICO": ("CORREO", "MAIL"), + "CUIT_CUIL": ("CUIT", "CUIL"), + "DIRECCION": ("DIREC", "DIR"), + "ESTUDIOS": ("ESTUD", "EDU"), + "MARCA_AUTOMOVIL": ("MARCA_AUTO", "AUTO"), + "NACIONALIDAD": ("NACIONAL", "NAC"), + "NOMBRE_ARCHIVO": ("NOM_ARCH", "ARCH"), + "NUM_ACTUACION": ("NUM_ACT", "ACT"), + "NUM_CAJA_AHORRO": ("NUM_CAJA", "CAJA"), + "NUM_EXPEDIENTE": ("NUM_EXP", "EXPTE"), + "NUM_MATRICULA": ("NUM_MAT", "MAT"), + "PATENTE_DOMINIO": ("PAT_DOM", "PAT"), + "TELEFONO": ("TELEF", "TEL"), + "TEXTO_ANONIMIZAR": ("TEXTO_ANON", "ANON"), + "USUARIX": ("USUAR", "USR"), +} PDF_TAG_RECT_X_PADDING = 0.5 PDF_TAG_RECT_Y_PADDING = 0.0 PDF_TAG_RECT_INSET = 0.5 @@ -43,11 +68,6 @@ # Vertical overlap ratio required to consider two image rects as matching _IMAGE_OVERLAP_THRESHOLD = 0.3 -# DPI used to rasterise PDF image regions for OpenCV editing. -_IMAGE_EDIT_DPI = 200 -_IMAGE_EDIT_MASK_DILATE = 1 -_IMAGE_EDIT_INPAINT_RADIUS = 3 - def _line_text(line: dict) -> str: return "".join(span.get("text", "") for span in line.get("spans", [])) @@ -258,9 +278,9 @@ def _label_end(label: dict) -> int: def _label_surface_text(label: dict, document: str) -> str: attrs = label.get("attrs") or {} - # Prefer explicit alt text when the key is present - if "aymurai_alt_text" in attrs: - alt_text = attrs["aymurai_alt_text"] + # Prefer explicit alt text when it has an actual value. 
+ alt_text = attrs.get("aymurai_alt_text") + if alt_text is not None: return str(alt_text) if alt_text else "" # Use alt char offsets when available @@ -271,11 +291,6 @@ def _label_surface_text(label: dict, document: str) -> str: start, end = int(alt_start), int(alt_end) if 0 <= start < end <= len(document): return document[start:end] - # Alt range is empty/invalid — alt processing cleared this label - return "" - - # If alt keys exist but values are None, alt processing cleared this label - if "aymurai_alt_start_char" in attrs and alt_start is None: return "" # No alt info available; use raw char offsets @@ -332,6 +347,22 @@ def _abbreviate_token(base: str, length: int) -> str: return normalized[:length] or normalized[:1] or "E" +def _token_aliases(base: str) -> tuple[str, ...]: + aliases = PDF_TOKEN_ALIAS_MAP.get(base.upper(), ()) + normalized_aliases: list[str] = [] + + for alias in aliases: + normalized = re.sub(r"[^A-Z0-9_]", "", str(alias).upper()) + if ( + normalized + and normalized != base.upper() + and normalized not in normalized_aliases + ): + normalized_aliases.append(normalized) + + return tuple(normalized_aliases) + + def _build_display_token_candidates(token: str) -> list[str]: base, suffix = _token_parts(token.upper()) candidates: list[str] = [] @@ -340,15 +371,18 @@ def add(value: str) -> None: if value and value not in candidates: candidates.append(value) - if suffix: - add(f"<{base}_{suffix}>") - add(f"<{base}>") - - for length in (PDF_TAG_MAX_ABBREVIATION, 1): - abbreviated = _abbreviate_token(base, length) + def add_base_variants(label: str) -> None: if suffix: - add(f"<{abbreviated}_{suffix}>") - add(f"<{abbreviated}>") + add(f"<{label}_{suffix}>") + add(f"<{label}>") + + add_base_variants(base) + + for alias in _token_aliases(base): + add_base_variants(alias) + + abbreviated = _abbreviate_token(base, PDF_TAG_MAX_ABBREVIATION) + add_base_variants(abbreviated) return candidates @@ -748,6 +782,51 @@ def _line_chars_from_page(page: pymupdf.Page, 
line: dict) -> list[dict[str, Any] return best_chars +def _line_chars_text(chars: list[dict[str, Any]]) -> str: + return "".join(str(entry.get("char") or "") for entry in chars) + + +def _find_line_char_span( + chars: list[dict[str, Any]], + text: str, + *, + start: int = 0, + raw_text: str | None = None, +) -> tuple[int, int] | None: + """ + Match *text* against the raw character stream for a line. + + ``line["text"]`` comes from PyMuPDF layout text and can differ from the + raw character stream returned by ``rawdict``. Searching the raw stream + keeps the redaction rectangle aligned with the actual glyph boxes. + """ + if not chars or not text: + return None + + haystack = raw_text if raw_text is not None else _line_chars_text(chars) + pattern = _build_flexible_pattern(text) + + def _search(offset: int) -> tuple[int, int] | None: + exact_idx = haystack.find(text, offset) + flexible_span = None + if pattern: + match = re.search(pattern, haystack[offset:]) + if match is not None: + flexible_span = (offset + match.start(), offset + match.end()) + + if exact_idx < 0: + return flexible_span + exact_span = (exact_idx, exact_idx + len(text)) + if flexible_span is None: + return exact_span + return min(exact_span, flexible_span, key=lambda span: span[0]) + + span = _search(start) + if span is None and start > 0: + span = _search(0) + return span + + def _rect_from_char_slice( chars: list[dict[str, Any]], start: int, @@ -813,6 +892,7 @@ def _build_page_op( return { "redact_rect": _text_redact_rect(rect), + "background_rect": canvas_rect, "canvas_rect": canvas_rect, "render_rect": render_rect, "line_rect": line_clip, @@ -822,12 +902,32 @@ def _build_page_op( "fontsize": fitted_size, "text_align": pymupdf.TEXT_ALIGN_LEFT, "text_color": style.get("color") or (0.0, 0.0, 0.0), - "is_image": is_image, - "skip_background_fill": is_image, "style": style, } +def _signature_background_rect( + op: dict[str, Any], + widget_rect: pymupdf.Rect, +) -> pymupdf.Rect: + background = 
pymupdf.Rect( + op.get("line_rect") or op.get("canvas_rect") or widget_rect + ) + canvas_rect = op.get("canvas_rect") + if canvas_rect is not None: + background.include_rect(pymupdf.Rect(canvas_rect)) + + pad_x = max(background.height * 0.75, 2.0) + pad_y = max(background.height * 0.25, 0.75) + widget_clip = pymupdf.Rect(widget_rect) + + background.x0 = max(widget_clip.x0, background.x0 - pad_x) + background.y0 = max(widget_clip.y0, background.y0 - pad_y) + background.x1 = min(widget_clip.x1, background.x1 + pad_x) + background.y1 = min(widget_clip.y1, background.y1 + pad_y) + return background + + def _image_rects_for_clip( page: pymupdf.Page, clip: pymupdf.Rect, @@ -1040,27 +1140,30 @@ def _apply_signature_widget_ops( grouped.setdefault(int(op["widget_xref"]), []).append(op) for widget_xref, widget_group_ops in grouped.items(): - widget_rect = pymupdf.Rect(widget_group_ops[0]["widget_rect"]) - - try: - pix = page.get_pixmap( - clip=widget_rect, - matrix=pymupdf.Matrix( - _IMAGE_EDIT_DPI / 72.0, _IMAGE_EDIT_DPI / 72.0 - ), - alpha=False, - ) - except Exception as exc: - logger.warning( - "Could not rasterise signature widget xref=%s on page=%s: %s", - widget_xref, - page_idx, - exc, - ) - pix = None - widget = widgets.get(widget_xref) + widget_rect = pymupdf.Rect( + widget_group_ops[0].get("widget_rect") or (0, 0, 0, 0) + ) + appearance_png: bytes | None = None + if widget is not None: + widget_rect = pymupdf.Rect(widget.rect) + try: + scale = 200 / 72.0 + pix = page.get_pixmap( + clip=widget_rect, + matrix=pymupdf.Matrix(scale, scale), + alpha=False, + ) + appearance_png = pix.tobytes("png") + except Exception as exc: + logger.warning( + "Could not snapshot signature widget xref=%s on page=%s: %s", + widget_xref, + page_idx, + exc, + ) + try: page.delete_widget(widget) except Exception as exc: @@ -1070,88 +1173,27 @@ def _apply_signature_widget_ops( page_idx, exc, ) - - if pix is None: - page.draw_rect( - widget_rect, - color=(1, 1, 1), - fill=(1, 1, 1), - 
width=0, - overlay=True, - ) + appearance_png = None else: - img = ( - np.frombuffer(pix.samples, dtype=np.uint8) - .reshape(pix.height, pix.width, pix.n) - .copy() + logger.warning( + "Could not resolve PDF signature widget xref=%s on page=%s", + widget_xref, + page_idx, ) - if pix.n >= 3: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - scale = _IMAGE_EDIT_DPI / 72.0 - mask = np.zeros(img.shape[:2], dtype=np.uint8) - for op in widget_group_ops: - canvas = op["canvas_rect"] - x0 = max(int((canvas.x0 - widget_rect.x0) * scale), 0) - y0 = max(int((canvas.y0 - widget_rect.y0) * scale), 0) - x1 = min(int((canvas.x1 - widget_rect.x0) * scale), img.shape[1]) - y1 = min(int((canvas.y1 - widget_rect.y0) * scale), img.shape[0]) - if x1 <= x0 or y1 <= y0: - continue - mask[y0:y1, x0:x1] = 255 - if np.any(mask): - if _IMAGE_EDIT_MASK_DILATE > 0: - kernel = np.ones((3, 3), dtype=np.uint8) - mask = cv2.dilate( - mask, kernel, iterations=_IMAGE_EDIT_MASK_DILATE - ) - try: - img = cv2.inpaint( - img, - mask, - _IMAGE_EDIT_INPAINT_RADIUS, - cv2.INPAINT_TELEA, - ) - except Exception as exc: - logger.warning( - "OpenCV inpaint failed for signature widget xref=%s on page=%s: %s", - widget_xref, - page_idx, - exc, - ) - img[mask > 0] = 255 - - success, png_buf = cv2.imencode(".png", img) - if success: - try: - page.insert_image( - widget_rect, stream=png_buf.tobytes(), overlay=True - ) - except Exception as exc: - logger.warning( - "Failed to insert edited signature widget image xref=%s on page=%s: %s", - widget_xref, - page_idx, - exc, - ) - page.draw_rect( - widget_rect, - color=(1, 1, 1), - fill=(1, 1, 1), - width=0, - overlay=True, - ) - else: - page.draw_rect( - widget_rect, - color=(1, 1, 1), - fill=(1, 1, 1), - width=0, - overlay=True, + if appearance_png and widget_rect.get_area() > 0: + try: + page.insert_image(widget_rect, stream=appearance_png, overlay=True) + except Exception as exc: + logger.warning( + "Failed to restore signature widget appearance xref=%s on page=%s: 
%s", + widget_xref, + page_idx, + exc, ) for op in widget_group_ops: + op["background_rect"] = _signature_background_rect(op, widget_rect) _render_text_op(page, op) @@ -1165,6 +1207,8 @@ def _collect_page_redactions( signature_widget_ops: dict[int, list[dict]] = {} line_x_cursor: dict[tuple[int, int, int], float] = {} line_char_cache: dict[tuple[int, int, int], list[dict[str, Any]]] = {} + line_char_text_cache: dict[tuple[int, int, int], str] = {} + line_char_cursor: dict[tuple[int, int, int], int] = {} # Pre-compute image rects and widgets per page page_image_rects: dict[int, list[pymupdf.Rect]] = {} @@ -1253,7 +1297,6 @@ def _collect_page_redactions( token, entity_style=fallback_widget.get("style") or None, ) - op["skip_background_fill"] = True op["widget_xref"] = fallback_widget["xref"] op["widget_rect"] = fallback_widget["rect"] signature_widget_ops.setdefault(page_index, []).append(op) @@ -1343,11 +1386,30 @@ def _collect_page_redactions( line_chars = _line_chars_from_page(page, line) line_char_cache[line_key] = line_chars - raw_start = ( - overlap_start - line["start"] + int(line.get("strip_offset", 0)) + line_char_text = line_char_text_cache.get(line_key) + if line_char_text is None: + line_char_text = _line_chars_text(line_chars) + line_char_text_cache[line_key] = line_char_text + + raw_span = _find_line_char_span( + line_chars, + segment_text, + start=line_char_cursor.get(line_key, 0), + raw_text=line_char_text, ) - raw_end = overlap_end - line["start"] + int(line.get("strip_offset", 0)) - rect = _rect_from_char_slice(line_chars, raw_start, raw_end) + rect = None + if raw_span is not None: + line_char_cursor[line_key] = raw_span[1] + rect = _rect_from_char_slice(line_chars, raw_span[0], raw_span[1]) + + if rect is None: + raw_start = ( + overlap_start - line["start"] + int(line.get("strip_offset", 0)) + ) + raw_end = ( + overlap_end - line["start"] + int(line.get("strip_offset", 0)) + ) + rect = _rect_from_char_slice(line_chars, raw_start, raw_end) if rect 
is None: rect = _pick_rect_group_for_segment( page, @@ -1402,7 +1464,6 @@ def _collect_page_redactions( token, entity_style=ent_style, ) - op["skip_background_fill"] = True op["widget_xref"] = widget_info["xref"] op["widget_rect"] = widget_info["rect"] signature_widget_ops.setdefault(page_index, []).append(op) @@ -1469,7 +1530,6 @@ def _collect_page_redactions( op["image_rect"] = seg_img if signature_widget is not None: - op["skip_background_fill"] = True op["widget_xref"] = signature_widget["xref"] op["widget_rect"] = signature_widget["rect"] signature_widget_ops.setdefault(page_index, []).append(op) @@ -1533,31 +1593,7 @@ def _apply_redactions( for page_idx, ops in page_ops.items(): page = doc[page_idx] - # Separate image ops from text ops - text_ops: list[dict] = [] - image_ops: list[dict] = [] for op in ops: - if op.get("is_image") and op.get("image_rect") is not None: - image_ops.append(op) - else: - text_ops.append(op) - - # ── Image entities: edit via OpenCV ────────────────────────── - # Group image ops by their image_rect so we render/edit each - # image only once even when multiple entities overlap it. - if image_ops: - img_groups: dict[tuple, list[dict]] = {} - for op in image_ops: - key = _rect_tuple(op["image_rect"]) - img_groups.setdefault(key, []).append(op) - - for rect_key, group_ops in img_groups.items(): - img_rect = pymupdf.Rect(rect_key) - _edit_image_with_opencv(page, img_rect, group_ops) - - # ── Text entities: standard redact flow ────────────────────── - # 1) Add text redaction annotations - for op in text_ops: page.add_redact_annot( op["redact_rect"], text=None, @@ -1565,113 +1601,19 @@ def _apply_redactions( cross_out=False, ) - # 2) Apply text redactions (images are never touched here) page.apply_redactions( images=pymupdf.PDF_REDACT_IMAGE_NONE, graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, text=pymupdf.PDF_REDACT_TEXT_REMOVE, ) - # 3) Draw replacement text after the redactions and image edits are in place. 
- for op in text_ops: - _render_text_op(page, op) - for op in image_ops: + for op in ops: _render_text_op(page, op) -def _edit_image_with_opencv( - page: pymupdf.Page, - img_rect: pymupdf.Rect, - ops: list[dict], -) -> None: - """Rasterise *img_rect* from *page*, remove the original entity pixels, - and overlay the edited image back onto the page. - - Tags are rendered afterwards with the normal PDF text path so they stay - sharp and aligned with the surrounding text instead of being rasterised by - OpenCV. - """ - scale = _IMAGE_EDIT_DPI / 72.0 - mat = pymupdf.Matrix(scale, scale) - - try: - pix = page.get_pixmap(clip=img_rect, matrix=mat, alpha=False) - except Exception as exc: - logger.warning("Could not rasterise image region %s: %s", img_rect, exc) - page.draw_rect( - img_rect, - color=(1, 1, 1), - fill=(1, 1, 1), - width=0, - overlay=True, - ) - return - - img = ( - np.frombuffer(pix.samples, dtype=np.uint8) - .reshape( - pix.height, - pix.width, - pix.n, - ) - .copy() - ) - if pix.n >= 3: - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - mask = np.zeros(img.shape[:2], dtype=np.uint8) - for op in ops: - canvas = op["canvas_rect"] - x0 = max(int((canvas.x0 - img_rect.x0) * scale), 0) - y0 = max(int((canvas.y0 - img_rect.y0) * scale), 0) - x1 = min(int((canvas.x1 - img_rect.x0) * scale), img.shape[1]) - y1 = min(int((canvas.y1 - img_rect.y0) * scale), img.shape[0]) - - if x1 <= x0 or y1 <= y0: - continue - - mask[y0:y1, x0:x1] = 255 - - if np.any(mask): - if _IMAGE_EDIT_MASK_DILATE > 0: - kernel = np.ones((3, 3), dtype=np.uint8) - mask = cv2.dilate(mask, kernel, iterations=_IMAGE_EDIT_MASK_DILATE) - try: - img = cv2.inpaint(img, mask, _IMAGE_EDIT_INPAINT_RADIUS, cv2.INPAINT_TELEA) - except Exception as exc: - logger.warning("OpenCV inpaint failed for rect %s: %s", img_rect, exc) - img[mask > 0] = 255 - - success, png_buf = cv2.imencode(".png", img) - if not success: - logger.warning("Failed to encode edited image for rect %s", img_rect) - page.draw_rect( - 
img_rect, - color=(1, 1, 1), - fill=(1, 1, 1), - width=0, - overlay=True, - ) - return - - try: - page.insert_image(img_rect, stream=png_buf.tobytes(), overlay=True) - except Exception as exc: - logger.warning( - "Failed to re-insert edited image for rect %s: %s", img_rect, exc - ) - page.draw_rect( - img_rect, - color=(1, 1, 1), - fill=(1, 1, 1), - width=0, - overlay=True, - ) - - def _render_text_op(page: pymupdf.Page, op: dict) -> None: """Render a single anonymisation tag onto *page*.""" - canvas = op["canvas_rect"] + canvas = pymupdf.Rect(op.get("background_rect") or op["canvas_rect"]) if not op.get("skip_background_fill"): page.draw_rect( canvas, @@ -1746,21 +1688,187 @@ def _render_text_op(page: pymupdf.Page, op: dict) -> None: ) +@lru_cache(maxsize=1) +def _watermark_font_paths() -> tuple[str | None, str | None]: + search_roots = [ + Path("/workspace"), + Path("/usr/share/fonts"), + Path("/usr/local/share/fonts"), + Path.home() / ".local/share/fonts", + ] + candidates: list[Path] = [] + seen: set[str] = set() + + for root in search_roots: + if not root.exists(): + continue + try: + iterator = root.rglob("*") + except Exception: + continue + for path in iterator: + if not path.is_file() or path.suffix.lower() not in { + ".ttf", + ".otf", + ".ttc", + }: + continue + if "archivo" not in path.name.lower(): + continue + resolved = str(path.resolve()) + if resolved not in seen: + seen.add(resolved) + candidates.append(path) + + candidates = sorted(candidates, key=lambda item: item.name.lower()) + regular_path: str | None = None + bold_path: str | None = None + + for path in candidates: + name = path.name.lower() + if regular_path is None and "bold" not in name and "italic" not in name: + regular_path = str(path) + if bold_path is None and "bold" in name: + bold_path = str(path) + + if regular_path is None and candidates: + regular_path = str(candidates[0]) + if bold_path is None: + bold_path = regular_path + + return regular_path, bold_path + + 
+@lru_cache(maxsize=1) +def _watermark_font_config() -> dict[str, Any]: + regular_path, bold_path = _watermark_font_paths() + if regular_path: + try: + return { + "text_fontname": "archivo-watermark", + "text_fontfile": regular_path, + "text_font": pymupdf.Font(fontfile=regular_path), + "link_fontname": "archivo-watermark-bold", + "link_fontfile": bold_path or regular_path, + "link_font": pymupdf.Font(fontfile=bold_path or regular_path), + } + except Exception as exc: + logger.warning( + "Could not load Archivo font for PDF watermark, falling back to Helvetica: %s", + exc, + ) + + return { + "text_fontname": "Helvetica", + "text_fontfile": None, + "text_font": pymupdf.Font("Helvetica"), + "link_fontname": "Helvetica-Bold", + "link_fontfile": None, + "link_font": pymupdf.Font("Helvetica-Bold"), + } + + +def _watermark_text_length( + text: str, + *, + font_obj: pymupdf.Font, + fontname: str, + fontsize: float, +) -> float: + try: + return float(font_obj.text_length(text, fontsize=fontsize)) + except Exception: + return float( + pymupdf.get_text_length(text, fontname=fontname, fontsize=fontsize) + ) + + +def _insert_watermark_text( + page: pymupdf.Page, + point: tuple[float, float], + text: str, + *, + fontname: str, + fontsize: float, + color: tuple[float, float, float], + fontfile: str | None = None, +) -> None: + kwargs: dict[str, Any] = { + "fontsize": fontsize, + "fontname": fontname, + "color": color, + "overlay": True, + } + if fontfile: + kwargs["fontfile"] = fontfile + page.insert_text(point, text, **kwargs) + + def _add_footer_watermark(doc: pymupdf.Document) -> None: - for page in doc: - text_width = pymupdf.get_text_length( - WATERMARK_TEXT, - fontname="helv", - fontsize=8, + font_config = _watermark_font_config() + prefix_width = _watermark_text_length( + WATERMARK_PREFIX_TEXT, + font_obj=font_config["text_font"], + fontname=font_config["text_fontname"], + fontsize=WATERMARK_FONT_SIZE, + ) + link_width = _watermark_text_length( + WATERMARK_LINK_TEXT, + 
font_obj=font_config["link_font"], + fontname=font_config["link_fontname"], + fontsize=WATERMARK_FONT_SIZE, + ) + total_width = prefix_width + link_width + + for page_index, page in enumerate(doc): + if page_index % 2 == 0: + x_start = max( + WATERMARK_MARGIN_X, page.rect.width - total_width - WATERMARK_MARGIN_X + ) + else: + x_start = WATERMARK_MARGIN_X + + baseline_y = page.rect.height - WATERMARK_BASELINE_MARGIN + link_x = x_start + prefix_width + + _insert_watermark_text( + page, + (x_start, baseline_y), + WATERMARK_PREFIX_TEXT, + fontname=font_config["text_fontname"], + fontsize=WATERMARK_FONT_SIZE, + color=WATERMARK_TEXT_COLOR, + fontfile=font_config["text_fontfile"], ) - x_pos = max(24.0, page.rect.width - text_width - 24.0) - y_pos = page.rect.height - 12.0 - page.insert_text( - (x_pos, y_pos), - WATERMARK_TEXT, - fontsize=8, - fontname="helv", - color=(0.72, 0.72, 0.72), + _insert_watermark_text( + page, + (link_x, baseline_y), + WATERMARK_LINK_TEXT, + fontname=font_config["link_fontname"], + fontsize=WATERMARK_FONT_SIZE, + color=WATERMARK_LINK_COLOR, + fontfile=font_config["link_fontfile"], + ) + + underline_y = min(page.rect.height - 1.0, baseline_y + 1.0) + page.draw_line( + (link_x, underline_y), + (link_x + link_width, underline_y), + color=WATERMARK_LINK_COLOR, + width=0.8, + overlay=True, + ) + page.insert_link( + { + "kind": pymupdf.LINK_URI, + "from": pymupdf.Rect( + link_x, + baseline_y - WATERMARK_FONT_SIZE, + link_x + link_width, + min(page.rect.height, baseline_y + 2.0), + ), + "uri": WATERMARK_URL, + } ) From 8d41f7e49501931ea08df310de5d58ac221eed17 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 17:16:48 +0000 Subject: [PATCH 14/28] =?UTF-8?q?=E2=9C=85=20Add=20integration=20tests=20f?= =?UTF-8?q?or=20merging=20fragmented=20numeric=20labels=20and=20excluding?= =?UTF-8?q?=20null=20alt=20attributes=20in=20PDF=20anonymization?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.../api/routers/anonymizer/test_anonymizer.py | 163 +++++++++++++++++- 1 file changed, 161 insertions(+), 2 deletions(-) diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py index 87f515e..d064329 100644 --- a/tests/api/routers/anonymizer/test_anonymizer.py +++ b/tests/api/routers/anonymizer/test_anonymizer.py @@ -230,8 +230,8 @@ def test_should_disambiguate_and_persist_paragraphs( ): mock_build_canonical_entities.return_value = [] mock_get_canonical_dates.return_value = [] - mock_map_canonical_entities.side_effect = ( - lambda predictions, canonical_entities: predictions + mock_map_canonical_entities.side_effect = lambda predictions, canonical_entities: ( + predictions ) text = "Ana Pérez denunció en el juzgado." @@ -343,6 +343,165 @@ def fake_convert(*args, **kwargs): assert len(response.content) > 0 +@pytest.mark.integration +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.load_pipeline") +def test_should_merge_fragmented_numeric_labels_in_predict_response( + mock_load_pipeline, client +): + mock_pipeline = MagicMock() + mock_pipeline.preprocess.return_value = [ + {"path": "empty", "data": {"doc.text": "REGISTRO NRO. 1 / 2025"}} + ] + mock_pipeline.predict_single.return_value = { + "data": {"doc.text": "REGISTRO NRO. 1 / 2025"}, + "predictions": { + "entities": [ + { + "text": "1", + "start_char": 14, + "end_char": 15, + "attrs": {"aymurai_label": "NUM_ACTUACION"}, + }, + { + "text": "2025", + "start_char": 18, + "end_char": 22, + "attrs": {"aymurai_label": "NUM_ACTUACION"}, + }, + ] + }, + } + mock_pipeline.postprocess.return_value = [mock_pipeline.predict_single.return_value] + mock_load_pipeline.return_value = mock_pipeline + + response = client.post( + "/anonymizer/predict", + json={"text": "REGISTRO NRO. 
1 / 2025"}, + params={"use_cache": False}, + ) + + assert response.status_code == 200 + data = response.json() + assert len(data["labels"]) == 1 + assert data["labels"][0]["text"] == "1 / 2025" + assert data["labels"][0]["start_char"] == 14 + assert data["labels"][0]["end_char"] == 22 + + +@pytest.mark.integration +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") +def test_should_merge_fragmented_labels_before_pdf_anonymization( + mock_get_anonymizer, client, tmp_path +): + anonymized_path = str(tmp_path / "output.pdf") + with open(anonymized_path, "wb") as f: + f.write(b"%PDF-1.4\n") + + mock_anonymizer = MagicMock(return_value=anonymized_path) + mock_get_anonymizer.return_value = mock_anonymizer + + first = build_label("NUM_ACTUACION", "1").model_dump(mode="json") + first["start_char"] = 14 + first["end_char"] = 15 + second = build_label("NUM_ACTUACION", "2025").model_dump(mode="json") + second["start_char"] = 16 + second["end_char"] = 20 + + annotations = { + "data": [ + { + "document": "REGISTRO NRO. 
1/2025", + "labels": [first, second], + } + ], + "label_policies": {"NUM_ACTUACION": {"anonymize": True}}, + "render_policy": {"suffix_mode": "always", "suffix_threshold": 1}, + } + + response = client.post( + "/anonymizer/anonymize-document", + data={"annotations": json.dumps(annotations)}, + files={ + "file": ( + "sample.pdf", + b"%PDF-1.4\n", + "application/pdf", + ) + }, + ) + + assert response.status_code == 200 + preds = mock_anonymizer.call_args[0][1] + assert len(preds[0]["labels"]) == 1 + assert preds[0]["labels"][0]["text"] == "1/2025" + assert preds[0]["labels"][0]["start_char"] == 14 + assert preds[0]["labels"][0]["end_char"] == 20 + + attrs = preds[0]["labels"][0]["attrs"] + assert attrs["aymurai_alt_text"] == "1/2025" + assert attrs["aymurai_alt_start_char"] == 14 + assert attrs["aymurai_alt_end_char"] == 20 + + render_context = mock_anonymizer.call_args.kwargs["render_context"] + assert render_context["count_by_base"]["NUM_ACTUACION"] == 1 + assert render_context["index_by_entity"][("NUM_ACTUACION", "1/2025")] == 1 + + +@pytest.mark.integration +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.subprocess.check_output") +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") +def test_should_exclude_null_alt_attrs_from_anonymize_document_preds( + mock_get_anonymizer, mock_check_output, client, tmp_path +): + anonymized_path = str(tmp_path / "output.docx") + with open(anonymized_path, "wb") as f: + f.write(b"fake-docx-content") + + mock_anonymizer = MagicMock(return_value=anonymized_path) + mock_get_anonymizer.return_value = mock_anonymizer + + def fake_convert(*args, **kwargs): + cmd = args[0] + source_path = cmd[-1] + output_path = source_path.rsplit(".", 1)[0] + ".odt" + with open(output_path, "wb") as output_file: + output_file.write(b"odt-content") + return "ok" + + mock_check_output.side_effect = fake_convert + annotations = { + "data": [ + { + "document": "Ana Perez denuncio en el juzgado.", + "labels": 
[build_label("PER", "Ana Perez").model_dump(mode="json")], + } + ], + "label_policies": {"PER": {"anonymize": True, "disambiguation": "fuzzy"}}, + "render_policy": {"suffix_mode": "auto", "suffix_threshold": 1}, + } + + response = client.post( + "/anonymizer/anonymize-document", + data={"annotations": json.dumps(annotations)}, + files={ + "file": ( + "sample.docx", + b"input-document", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + }, + ) + + assert response.status_code == 200 + preds = mock_anonymizer.call_args[0][1] + assert preds[0]["labels"][0]["text"] == "Ana Perez" + + attrs = preds[0]["labels"][0]["attrs"] + assert "aymurai_alt_text" not in attrs + assert "aymurai_alt_start_char" not in attrs + assert "aymurai_alt_end_char" not in attrs + + @pytest.mark.integration @patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.subprocess.check_output") @patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") From e665edbb9b16d3f62bf034b2b061c62f190e56a5 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 17:17:21 +0000 Subject: [PATCH 15/28] =?UTF-8?q?=E2=9E=96=20Remove=20opencv-python-headle?= =?UTF-8?q?ss=20dependency=20from=20project=20requirements?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c572162..b5bb6d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,7 +79,6 @@ dependencies = [ "sentence-transformers>=2.2.0", "pymupdf>=1.25.2", "pymupdf4llm>=0.0.17", - "opencv-python-headless>=4.5.0", "pypandoc>=1.15", "python-docx>=1.2.0", "docx2txt>=0.9", From 713e4ee17c50ff94600e23be58c889d2ef0feac0 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 21:46:11 +0000 Subject: [PATCH 16/28] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Implement=20paragrap?= =?UTF-8?q?h=20splitting=20function=20to=20enhance=20document=20text=20ext?= 
=?UTF-8?q?raction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../routers/misc/document_extract.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/aymurai/api/endpoints/routers/misc/document_extract.py b/aymurai/api/endpoints/routers/misc/document_extract.py index 56e4eaa..ba315b7 100644 --- a/aymurai/api/endpoints/routers/misc/document_extract.py +++ b/aymurai/api/endpoints/routers/misc/document_extract.py @@ -63,6 +63,20 @@ def run_safe_text_extraction( raise +def _split_document_paragraphs(document: str) -> list[str]: + if re.search(r"\n\s*\n+", document): + raw_paragraphs = re.split(r"\n\s*\n+", document) + else: + raw_paragraphs = document.splitlines() + + paragraphs = [ + re.sub(r"[ \t]{2,}", " ", paragraph.strip()) + for paragraph in raw_paragraphs + if paragraph.strip() + ] + return list(unique_justseen(paragraphs)) + + @router.post("/document-extract", response_model=Document) def plain_text_extractor(file: UploadFile) -> Document: """ @@ -111,13 +125,6 @@ def plain_text_extractor(file: UploadFile) -> Document: logger.info(f"removed temp file from local storage => {tmp_filename}") document_id = data_to_uuid(data) - - paragraphs = [ - paragraph.strip() - for paragraph in re.split(r"\n\s*\n+", document) - if paragraph.strip() - ] - paragraphs = [re.sub(r"[ \t]{2,}", " ", paragraph) for paragraph in paragraphs] - paragraphs = list(unique_justseen(paragraphs)) + paragraphs = _split_document_paragraphs(document) return Document(document=paragraphs, document_id=document_id) From ef3f672945258fe03a5188ab2f77a4a7eb89d288 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 21:47:09 +0000 Subject: [PATCH 17/28] =?UTF-8?q?=F0=9F=94=A7=20Update=20dependency=20inst?= =?UTF-8?q?allation=20command=20to=20prevent=20Python=20downloads?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/pytest.yml | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 4636bc5..319cd7f 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -64,7 +64,7 @@ jobs: - name: Install dependencies run: | - uv sync --frozen --python python --no-dev --no-managed-python --group tests + uv sync --frozen --python python --no-dev --no-python-downloads --group tests - name: Run api tests env: From 78669143eb8573723fb7e11540ebc964f7596c99 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 21:56:21 +0000 Subject: [PATCH 18/28] =?UTF-8?q?=F0=9F=94=A5=20Remove=20redundant=20tests?= =?UTF-8?q?=20for=20merging=20fragmented=20numeric=20labels=20and=20PDF=20?= =?UTF-8?q?anonymization?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api/routers/anonymizer/test_anonymizer.py | 104 ------------------ 1 file changed, 104 deletions(-) diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py index d064329..2dd50d4 100644 --- a/tests/api/routers/anonymizer/test_anonymizer.py +++ b/tests/api/routers/anonymizer/test_anonymizer.py @@ -343,110 +343,6 @@ def fake_convert(*args, **kwargs): assert len(response.content) > 0 -@pytest.mark.integration -@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.load_pipeline") -def test_should_merge_fragmented_numeric_labels_in_predict_response( - mock_load_pipeline, client -): - mock_pipeline = MagicMock() - mock_pipeline.preprocess.return_value = [ - {"path": "empty", "data": {"doc.text": "REGISTRO NRO. 1 / 2025"}} - ] - mock_pipeline.predict_single.return_value = { - "data": {"doc.text": "REGISTRO NRO. 
1 / 2025"}, - "predictions": { - "entities": [ - { - "text": "1", - "start_char": 14, - "end_char": 15, - "attrs": {"aymurai_label": "NUM_ACTUACION"}, - }, - { - "text": "2025", - "start_char": 18, - "end_char": 22, - "attrs": {"aymurai_label": "NUM_ACTUACION"}, - }, - ] - }, - } - mock_pipeline.postprocess.return_value = [mock_pipeline.predict_single.return_value] - mock_load_pipeline.return_value = mock_pipeline - - response = client.post( - "/anonymizer/predict", - json={"text": "REGISTRO NRO. 1 / 2025"}, - params={"use_cache": False}, - ) - - assert response.status_code == 200 - data = response.json() - assert len(data["labels"]) == 1 - assert data["labels"][0]["text"] == "1 / 2025" - assert data["labels"][0]["start_char"] == 14 - assert data["labels"][0]["end_char"] == 22 - - -@pytest.mark.integration -@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") -def test_should_merge_fragmented_labels_before_pdf_anonymization( - mock_get_anonymizer, client, tmp_path -): - anonymized_path = str(tmp_path / "output.pdf") - with open(anonymized_path, "wb") as f: - f.write(b"%PDF-1.4\n") - - mock_anonymizer = MagicMock(return_value=anonymized_path) - mock_get_anonymizer.return_value = mock_anonymizer - - first = build_label("NUM_ACTUACION", "1").model_dump(mode="json") - first["start_char"] = 14 - first["end_char"] = 15 - second = build_label("NUM_ACTUACION", "2025").model_dump(mode="json") - second["start_char"] = 16 - second["end_char"] = 20 - - annotations = { - "data": [ - { - "document": "REGISTRO NRO. 
1/2025", - "labels": [first, second], - } - ], - "label_policies": {"NUM_ACTUACION": {"anonymize": True}}, - "render_policy": {"suffix_mode": "always", "suffix_threshold": 1}, - } - - response = client.post( - "/anonymizer/anonymize-document", - data={"annotations": json.dumps(annotations)}, - files={ - "file": ( - "sample.pdf", - b"%PDF-1.4\n", - "application/pdf", - ) - }, - ) - - assert response.status_code == 200 - preds = mock_anonymizer.call_args[0][1] - assert len(preds[0]["labels"]) == 1 - assert preds[0]["labels"][0]["text"] == "1/2025" - assert preds[0]["labels"][0]["start_char"] == 14 - assert preds[0]["labels"][0]["end_char"] == 20 - - attrs = preds[0]["labels"][0]["attrs"] - assert attrs["aymurai_alt_text"] == "1/2025" - assert attrs["aymurai_alt_start_char"] == 14 - assert attrs["aymurai_alt_end_char"] == 20 - - render_context = mock_anonymizer.call_args.kwargs["render_context"] - assert render_context["count_by_base"]["NUM_ACTUACION"] == 1 - assert render_context["index_by_entity"][("NUM_ACTUACION", "1/2025")] == 1 - - @pytest.mark.integration @patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.subprocess.check_output") @patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") From dd1153d19c1907e307d2e23f601fbb93b0555aea Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 21:56:41 +0000 Subject: [PATCH 19/28] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor=20anonymize?= =?UTF-8?q?r=20tests=20to=20use=20DOCX=20format=20and=20enhance=20mock=20f?= =?UTF-8?q?unctionality?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/api/routers/test_pipeline_flows.py | 35 ++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/tests/api/routers/test_pipeline_flows.py b/tests/api/routers/test_pipeline_flows.py index 3df22aa..8d53952 100644 --- a/tests/api/routers/test_pipeline_flows.py +++ b/tests/api/routers/test_pipeline_flows.py @@ -1,9 +1,11 @@ +import 
io import json import shutil import uuid -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest +from docx import Document as DocxDocument from aymurai.database.schema import DataPublicDocumentParagraph from tests.api.routers.conftest import build_mock_pipeline @@ -20,6 +22,7 @@ def _fake_libreoffice_convert(*args, **kwargs): @pytest.mark.integration @patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.subprocess.check_output") +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") @patch( "aymurai.api.endpoints.routers.anonymizer.anonymizer.map_canonical_entities_ner_preds" ) @@ -33,8 +36,10 @@ def test_should_run_anonymizer_flow_end_to_end( mock_build_canonical_entities, mock_get_canonical_dates, mock_map_canonical_entities, + mock_get_anonymizer, mock_check_output, client, + tmp_path, ): mock_extract.return_value = "Ana Pérez denunció.\nJuan Soto declaró." mock_load_pipeline.return_value = build_mock_pipeline() @@ -43,6 +48,12 @@ def test_should_run_anonymizer_flow_end_to_end( mock_map_canonical_entities.side_effect = lambda predictions, canonical_entities: ( predictions ) + + anonymized_path = str(tmp_path / "output.docx") + with open(anonymized_path, "wb") as f: + f.write(b"fake-docx-content") + mock_anonymizer = MagicMock(return_value=anonymized_path) + mock_get_anonymizer.return_value = mock_anonymizer mock_check_output.side_effect = _fake_libreoffice_convert extract_response = client.post( @@ -81,7 +92,13 @@ def test_should_run_anonymizer_flow_end_to_end( compile_response = client.post( "/anonymizer/anonymize-document", data={"annotations": json.dumps(annotations)}, - files={"file": ("sample.txt", b"doc-bytes", "text/plain")}, + files={ + "file": ( + "sample.docx", + b"doc-bytes", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + }, ) assert compile_response.status_code == 200 assert compile_response.headers["content-type"] == "application/octet-stream" @@ 
-162,10 +179,22 @@ def test_should_compile_anonymized_document_with_real_libreoffice_when_available "render_policy": {"suffix_mode": "auto", "suffix_threshold": 1}, } + doc = DocxDocument() + doc.add_paragraph("Texto base para anonimizar.") + buf = io.BytesIO() + doc.save(buf) + docx_bytes = buf.getvalue() + response = client.post( "/anonymizer/anonymize-document", data={"annotations": json.dumps(annotations)}, - files={"file": ("sample.txt", b"input-document", "text/plain")}, + files={ + "file": ( + "sample.docx", + docx_bytes, + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + }, ) assert response.status_code == 200 From c37ba349a075faf35122063b049cac45da3200fd Mon Sep 17 00:00:00 2001 From: jansaldo Date: Thu, 9 Apr 2026 22:07:29 +0000 Subject: [PATCH 20/28] =?UTF-8?q?=F0=9F=94=A7=20Add=20xfail=20marker=20for?= =?UTF-8?q?=20PDF=20extraction=20test=20on=20Windows=20due=20to=20tensor?= =?UTF-8?q?=20type=20issue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/api/routers/misc/test_document_extract.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/api/routers/misc/test_document_extract.py b/tests/api/routers/misc/test_document_extract.py index 6a67fdd..124c4b9 100644 --- a/tests/api/routers/misc/test_document_extract.py +++ b/tests/api/routers/misc/test_document_extract.py @@ -1,5 +1,6 @@ import concurrent.futures import io +import sys from unittest.mock import patch import pytest @@ -74,6 +75,11 @@ def test_should_extract_real_text_from_sample_docx_without_mocking(client): @pytest.mark.integration @pytest.mark.slow +@pytest.mark.xfail( + sys.platform == "win32", + reason="pymupdf4llm ONNX layout model receives int32 tensors on Windows (expects int64)", + strict=False, +) def test_should_extract_real_text_from_pdf_without_mocking(client): """Test that a real PDF upload is extracted without mocking.""" expected_paragraphs = [ From 
620540bf89060f6128feb624624eccacea871c7c Mon Sep 17 00:00:00 2001 From: jansaldo Date: Fri, 10 Apr 2026 14:43:32 +0000 Subject: [PATCH 21/28] =?UTF-8?q?=E2=9C=A8=20Enhance=20PDF=20anonymization?= =?UTF-8?q?=20by=20adding=20cleanup=20rects,=20removing=20overlapping=20li?= =?UTF-8?q?nks,=20and=20scrubbing=20metadata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/anonymization/pdf.py | 131 +++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 1 deletion(-) diff --git a/aymurai/text/anonymization/pdf.py b/aymurai/text/anonymization/pdf.py index 50813c4..8a5f0f0 100644 --- a/aymurai/text/anonymization/pdf.py +++ b/aymurai/text/anonymization/pdf.py @@ -1581,6 +1581,128 @@ def _try_image_entity( return best +def _append_cleanup_rect( + cleanup_rects: dict[int, list[pymupdf.Rect]], + page_idx: int, + rect: pymupdf.Rect | tuple[float, float, float, float] | None, +) -> None: + if rect is None: + return + + cleanup_rect = pymupdf.Rect(rect) + if cleanup_rect.get_area() <= 0: + return + cleanup_rects.setdefault(page_idx, []).append(cleanup_rect) + + +def _cleanup_rect_for_page_op(op: dict[str, Any]) -> pymupdf.Rect | None: + if op.get("image_rect") is not None: + cleanup_rect = pymupdf.Rect(op["image_rect"]) + redact_rect = op.get("redact_rect") + if redact_rect is not None: + cleanup_rect.include_rect(pymupdf.Rect(redact_rect)) + return cleanup_rect + + cleanup_source = ( + op.get("redact_rect") or op.get("background_rect") or op.get("canvas_rect") + ) + if cleanup_source is None: + return None + return pymupdf.Rect(cleanup_source) + + +def _cleanup_rect_for_widget_op(op: dict[str, Any]) -> pymupdf.Rect | None: + widget_info = op.get("widget_info") or {} + widget_rect = widget_info.get("rect") + if widget_rect is None: + return None + return pymupdf.Rect(widget_rect) + + +def _cleanup_rect_for_signature_widget_op(op: dict[str, Any]) -> pymupdf.Rect | None: + widget_rect = op.get("widget_rect") + if 
widget_rect is not None: + return pymupdf.Rect(widget_rect) + + background_rect = op.get("background_rect") or op.get("canvas_rect") + if background_rect is None: + return None + return pymupdf.Rect(background_rect) + + +def _collect_link_cleanup_rects( + page_ops: dict[int, list[dict]], + widget_ops: dict[int, list[dict]], + signature_widget_ops: dict[int, list[dict]], +) -> dict[int, list[pymupdf.Rect]]: + cleanup_rects: dict[int, list[pymupdf.Rect]] = {} + + for page_idx, ops in page_ops.items(): + for op in ops: + _append_cleanup_rect(cleanup_rects, page_idx, _cleanup_rect_for_page_op(op)) + + for page_idx, ops in widget_ops.items(): + for op in ops: + _append_cleanup_rect( + cleanup_rects, + page_idx, + _cleanup_rect_for_widget_op(op), + ) + + for page_idx, ops in signature_widget_ops.items(): + for op in ops: + _append_cleanup_rect( + cleanup_rects, + page_idx, + _cleanup_rect_for_signature_widget_op(op), + ) + + return cleanup_rects + + +def _remove_overlapping_page_links( + doc: pymupdf.Document, + cleanup_rects: dict[int, list[pymupdf.Rect]], +) -> None: + for page_idx, page_rects in cleanup_rects.items(): + if not page_rects: + continue + + page = doc[page_idx] + for link in list(page.get_links()): + link_rect = link.get("from") + if link_rect is None: + continue + link_rect = pymupdf.Rect(link_rect) + if not any(link_rect.intersects(rect) for rect in page_rects): + continue + try: + page.delete_link(link) + except Exception as exc: + logger.warning( + "Failed to delete PDF link on page=%s rect=%s: %s", + page_idx, + tuple(round(value, 2) for value in link_rect), + exc, + ) + + +def _scrub_pdf_metadata(doc: pymupdf.Document) -> None: + doc.set_metadata( + { + "title": "", + "author": "", + "subject": "", + "keywords": "", + "creator": "", + "producer": "", + "creationDate": "", + "modDate": "", + "trapped": "", + } + ) + + def _apply_redactions( doc: pymupdf.Document, page_ops: dict[int, list[dict]], @@ -1909,10 +2031,17 @@ def anonymize( render_context, 
) _apply_redactions(doc, page_ops, widget_ops, signature_widget_ops) + cleanup_rects = _collect_link_cleanup_rects( + page_ops, + widget_ops, + signature_widget_ops, + ) + _remove_overlapping_page_links(doc, cleanup_rects) + _scrub_pdf_metadata(doc) _add_footer_watermark(doc) os.makedirs(output_dir, exist_ok=True) output_path = Path(output_dir) / f"{file_path.stem}.anonymized.pdf" - doc.save(str(output_path)) + doc.save(str(output_path), garbage=4, clean=1, deflate=1) return str(output_path) From 9c11eb1feabf9121399c501f98a867e2552b2bd2 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Fri, 17 Apr 2026 15:48:22 +0000 Subject: [PATCH 22/28] =?UTF-8?q?=F0=9F=94=A7=20Remove=20redundant=20retur?= =?UTF-8?q?n=20statement=20in=20=5Flabel=5Freplacement=5Ftext=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/anonymization/alignment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/aymurai/text/anonymization/alignment.py b/aymurai/text/anonymization/alignment.py index 5ca920a..49e4955 100644 --- a/aymurai/text/anonymization/alignment.py +++ b/aymurai/text/anonymization/alignment.py @@ -116,7 +116,6 @@ def _label_replacement_text(label: dict, document: str) -> str: start_char, end_char = int(alt_start), int(alt_end) if 0 <= start_char < end_char <= len(document): return document[start_char:end_char] - return "" start_char = int(label.get("start_char") or 0) end_char = int(label.get("end_char") or 0) From 435b305ee9a1826292dda93bd1644c555cea0ee0 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Fri, 17 Apr 2026 17:05:42 +0000 Subject: [PATCH 23/28] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor=20anonymiza?= =?UTF-8?q?tion=20module:=20split=20pdf=20and=20docx=20internals=20by=20fo?= =?UTF-8?q?rmat?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/settings.py | 4 + aymurai/text/anonymization/docx/__init__.py | 3 + .../{docx.py => docx/anonymizer.py} | 38 +- 
.../{watermarks.py => docx/watermark.py} | 0 .../{xml_docx.py => docx/xml.py} | 0 aymurai/text/anonymization/pdf.py | 2047 ----------------- aymurai/text/anonymization/pdf/__init__.py | 3 + aymurai/text/anonymization/pdf/anonymizer.py | 100 + aymurai/text/anonymization/pdf/common.py | 620 +++++ aymurai/text/anonymization/pdf/layout.py | 510 ++++ aymurai/text/anonymization/pdf/ops.py | 828 +++++++ aymurai/text/anonymization/pdf/sanitize.py | 294 +++ aymurai/text/anonymization/pdf/watermark.py | 315 +++ aymurai/text/anonymization/pdf/widgets.py | 323 +++ 14 files changed, 3035 insertions(+), 2050 deletions(-) create mode 100644 aymurai/text/anonymization/docx/__init__.py rename aymurai/text/anonymization/{docx.py => docx/anonymizer.py} (64%) rename aymurai/text/anonymization/{watermarks.py => docx/watermark.py} (100%) rename aymurai/text/anonymization/{xml_docx.py => docx/xml.py} (100%) delete mode 100644 aymurai/text/anonymization/pdf.py create mode 100644 aymurai/text/anonymization/pdf/__init__.py create mode 100644 aymurai/text/anonymization/pdf/anonymizer.py create mode 100644 aymurai/text/anonymization/pdf/common.py create mode 100644 aymurai/text/anonymization/pdf/layout.py create mode 100644 aymurai/text/anonymization/pdf/ops.py create mode 100644 aymurai/text/anonymization/pdf/sanitize.py create mode 100644 aymurai/text/anonymization/pdf/watermark.py create mode 100644 aymurai/text/anonymization/pdf/widgets.py diff --git a/aymurai/settings.py b/aymurai/settings.py index 3844d76..2079f56 100644 --- a/aymurai/settings.py +++ b/aymurai/settings.py @@ -65,6 +65,10 @@ def assemble_cors_origins(cls, v) -> list[str]: MEMORY_CACHE_TTL: int = 60 LIBREOFFICE_BIN: str = "libreoffice" + PDF_WATERMARK_FONT_REGULAR: str | None = None + PDF_WATERMARK_FONT_BOLD: str | None = None + ANONYMIZATION_METADATA_CREATOR: str = "AymurAI" + ANONYMIZATION_METADATA_PRODUCER: str = "AymurAI" # Disambiguation Config diff --git a/aymurai/text/anonymization/docx/__init__.py 
b/aymurai/text/anonymization/docx/__init__.py new file mode 100644 index 0000000..5d5d0ac --- /dev/null +++ b/aymurai/text/anonymization/docx/__init__.py @@ -0,0 +1,3 @@ +from aymurai.text.anonymization.docx.anonymizer import DocxAnonymizer + +__all__ = ["DocxAnonymizer"] diff --git a/aymurai/text/anonymization/docx.py b/aymurai/text/anonymization/docx/anonymizer.py similarity index 64% rename from aymurai/text/anonymization/docx.py rename to aymurai/text/anonymization/docx/anonymizer.py index 1ea3b95..73c4348 100644 --- a/aymurai/text/anonymization/docx.py +++ b/aymurai/text/anonymization/docx/anonymizer.py @@ -1,9 +1,11 @@ import os import tempfile +from datetime import datetime, timezone from glob import glob from pathlib import Path from typing import Any +from docx import Document from more_itertools import flatten from aymurai.text.anonymization.alignment import ( @@ -15,8 +17,9 @@ InvalidDocumentAnonymizer, register_anonymizer, ) -from aymurai.text.anonymization.watermarks import add_footer_watermark -from aymurai.text.anonymization.xml_docx import ( +from aymurai.text.anonymization.docx.watermark import add_footer_watermark +from aymurai.settings import settings +from aymurai.text.anonymization.docx.xml import ( create_docx, replace_text_in_xml, unzip_document, @@ -24,6 +27,21 @@ from aymurai.utils.cache import cache_load, cache_save, get_cache_key +def _set_aymurai_core_properties(doc_path: str) -> None: + """ + Applies the configured AymurAI tooling metadata fields to the DOCX core properties. + + Args: + doc_path (str): The path to the DOCX document to update. 
+ """ + document = Document(doc_path) + core_properties = document.core_properties + core_properties.author = "" + core_properties.last_modified_by = settings.ANONYMIZATION_METADATA_CREATOR + core_properties.modified = datetime.now(timezone.utc) + document.save(doc_path) + + @register_anonymizer class DocxAnonymizer(BaseAnonymizer): """ @@ -42,6 +60,19 @@ def anonymize( output_dir: str = ".", render_context: dict[str, Any] | None = None, ) -> str: + """ + Anonymizes a DOCX document using the matched paragraph predictions. + + Args: + item (dict): The item dictionary containing the input DOCX path. + preds (list[dict]): The predictions to apply to the document. + output_dir (str, optional): The directory where the anonymized document should be written. Defaults to '.'. + render_context (dict[str, Any] | None, optional): The rendering context used to resolve replacement tokens. + Defaults to None. + + Returns: + str: The path to the anonymized DOCX output file. + """ item_path = Path(item["path"]) file_path = self.ensure_file(item_path) @@ -81,7 +112,8 @@ def anonymize( output_path = f"{output_dir}/{os.path.basename(str(file_path))}" create_docx(tempdir, output_path) - # Add watermark to the footer + # Add metadata branding and the footer watermark + _set_aymurai_core_properties(output_path) add_footer_watermark(output_path) if self.use_cache: diff --git a/aymurai/text/anonymization/watermarks.py b/aymurai/text/anonymization/docx/watermark.py similarity index 100% rename from aymurai/text/anonymization/watermarks.py rename to aymurai/text/anonymization/docx/watermark.py diff --git a/aymurai/text/anonymization/xml_docx.py b/aymurai/text/anonymization/docx/xml.py similarity index 100% rename from aymurai/text/anonymization/xml_docx.py rename to aymurai/text/anonymization/docx/xml.py diff --git a/aymurai/text/anonymization/pdf.py b/aymurai/text/anonymization/pdf.py deleted file mode 100644 index 8a5f0f0..0000000 --- a/aymurai/text/anonymization/pdf.py +++ /dev/null @@ 
-1,2047 +0,0 @@ -from __future__ import annotations - -import os -import re -from copy import deepcopy -from functools import lru_cache -from pathlib import Path -from typing import Any -from unicodedata import normalize - -import pymupdf -import pymupdf.layout # noqa: F401 # activates layout support -from jiwer import cer -from pymupdf4llm.helpers import document_layout as pymupdf4llm_document_layout - -from aymurai.logger import get_logger -from aymurai.text.anonymization.alignment import resolve_render_token -from aymurai.text.anonymization.base import ( - BaseAnonymizer, - InvalidDocumentAnonymizer, - register_anonymizer, -) - -logger = get_logger(__name__) - -WATERMARK_PREFIX_TEXT = "Documento anonimizado por " -WATERMARK_LINK_TEXT = "AymurAI" -WATERMARK_TEXT = f"{WATERMARK_PREFIX_TEXT}{WATERMARK_LINK_TEXT}" -WATERMARK_URL = "https://www.aymurai.info/" -WATERMARK_FONT_FAMILY = "Archivo" -WATERMARK_FONT_SIZE = 10.0 -WATERMARK_MARGIN_X = 24.0 -WATERMARK_BASELINE_MARGIN = 12.0 -WATERMARK_TEXT_COLOR = tuple(channel / 255 for channel in (192, 192, 192)) -WATERMARK_LINK_COLOR = tuple(channel / 255 for channel in (115, 190, 250)) - -TEXT_FLAG_ITALIC = 2 -TEXT_FLAG_SERIF = 4 -TEXT_FLAG_MONOSPACED = 8 -TEXT_FLAG_BOLD = 16 -PDF_TAG_MIN_FONT_SIZE = 7.0 -PDF_TAG_FONT_STEP = 0.5 -PDF_TAG_MAX_ABBREVIATION = 3 -PDF_TOKEN_ALIAS_MAP: dict[str, tuple[str, str]] = { - "CORREO_ELECTRONICO": ("CORREO", "MAIL"), - "CUIT_CUIL": ("CUIT", "CUIL"), - "DIRECCION": ("DIREC", "DIR"), - "ESTUDIOS": ("ESTUD", "EDU"), - "MARCA_AUTOMOVIL": ("MARCA_AUTO", "AUTO"), - "NACIONALIDAD": ("NACIONAL", "NAC"), - "NOMBRE_ARCHIVO": ("NOM_ARCH", "ARCH"), - "NUM_ACTUACION": ("NUM_ACT", "ACT"), - "NUM_CAJA_AHORRO": ("NUM_CAJA", "CAJA"), - "NUM_EXPEDIENTE": ("NUM_EXP", "EXPTE"), - "NUM_MATRICULA": ("NUM_MAT", "MAT"), - "PATENTE_DOMINIO": ("PAT_DOM", "PAT"), - "TELEFONO": ("TELEF", "TEL"), - "TEXTO_ANONIMIZAR": ("TEXTO_ANON", "ANON"), - "USUARIX": ("USUAR", "USR"), -} -PDF_TAG_RECT_X_PADDING = 0.5 
-PDF_TAG_RECT_Y_PADDING = 0.0 -PDF_TAG_RECT_INSET = 0.5 -PDF_TAG_RECT_GAP_FACTOR = 0.5 -PDF_TAG_RECT_GAP_MIN = 3.0 -PDF_TAG_RECT_GAP_MAX = 8.0 - -# Vertical overlap ratio required to consider two image rects as matching -_IMAGE_OVERLAP_THRESHOLD = 0.3 - - -def _line_text(line: dict) -> str: - return "".join(span.get("text", "") for span in line.get("spans", [])) - - -def _rect_tuple(value: Any) -> tuple[float, float, float, float]: - if isinstance(value, pymupdf.Rect): - return (float(value.x0), float(value.y0), float(value.x1), float(value.y1)) - if isinstance(value, (list, tuple)) and len(value) == 4: - return (float(value[0]), float(value[1]), float(value[2]), float(value[3])) - raise ValueError(f"Invalid rectangle value: {value}") - - -def _default_style(fallback_size: float = 10.0) -> dict[str, Any]: - return { - "font": "", - "flags": 0, - "color": (0.0, 0.0, 0.0), - "size": fallback_size, - "ascender": 0.8, - "descender": -0.2, - } - - -def _span_text_weight(span: dict) -> tuple[int, float]: - text = str(span.get("text") or "").strip() - return (len(text), float(span.get("size") or 0.0)) - - -def _pdf_color_from_span(span: dict) -> tuple[float, float, float]: - try: - return tuple( - float(value) for value in pymupdf.sRGB_to_pdf(int(span.get("color") or 0)) - ) - except Exception: - return (0.0, 0.0, 0.0) - - -def _line_style(line: dict, fallback_size: float = 10.0) -> dict[str, Any]: - spans = [ - span for span in line.get("spans") or [] if str(span.get("text") or "").strip() - ] - if not spans: - return _default_style(fallback_size) - - dominant = max(spans, key=_span_text_weight) - return { - "font": str(dominant.get("font") or ""), - "flags": int(dominant.get("flags") or 0), - "color": _pdf_color_from_span(dominant), - "size": float(dominant.get("size") or fallback_size), - "ascender": float(dominant.get("ascender") or 0.8), - "descender": float(dominant.get("descender") or -0.2), - } - - -def _build_spans_detail(line: dict) -> tuple[list[dict], int]: - 
"""Build per-span style info with character offsets for entity-level - style lookup. Returns ``(spans_detail, strip_offset)``.""" - raw_text = normalize("NFKC", _line_text(line)) - strip_offset = len(raw_text) - len(raw_text.lstrip()) - - spans_detail: list[dict] = [] - cursor = 0 - for span in line.get("spans", []): - span_text = normalize("NFKC", span.get("text", "")) - span_start = cursor - cursor += len(span_text) - spans_detail.append( - { - "start": span_start, - "end": cursor, - "style": { - "font": str(span.get("font") or ""), - "flags": int(span.get("flags") or 0), - "color": _pdf_color_from_span(span), - "size": float(span.get("size") or 10.0), - "ascender": float(span.get("ascender") or 0.8), - "descender": float(span.get("descender") or -0.2), - }, - } - ) - return spans_detail, strip_offset - - -def _entity_style_from_spans( - line_entry: dict, - offset_in_stripped_text: int, -) -> dict[str, Any]: - """Return the style of the span at *offset_in_stripped_text* within the - line entry's (stripped) text. 
Falls back to line-level dominant style.""" - spans_detail = line_entry.get("spans_detail") - if not spans_detail: - return line_entry.get("style") or _default_style() - - strip_offset = line_entry.get("strip_offset", 0) - raw_offset = offset_in_stripped_text + strip_offset - - for span_info in spans_detail: - if span_info["start"] <= raw_offset < span_info["end"]: - return span_info["style"] - - return line_entry.get("style") or _default_style() - - -def _font_size(line: dict, fallback: float = 10.0) -> float: - spans = line.get("spans") or [] - sizes = [float(span.get("size")) for span in spans if span.get("size")] - if not sizes: - return fallback - size = sum(sizes) / len(sizes) - return max(size * 0.9, PDF_TAG_MIN_FONT_SIZE) - - -def _style_flags(style: dict[str, Any]) -> tuple[bool, bool, bool, bool]: - flags = int(style.get("flags") or 0) - font_label = str(style.get("font") or "").lower() - - is_bold = bool(flags & TEXT_FLAG_BOLD) or "bold" in font_label - is_italic = bool(flags & TEXT_FLAG_ITALIC) or any( - token in font_label for token in ("italic", "oblique") - ) - is_mono = bool(flags & TEXT_FLAG_MONOSPACED) or any( - token in font_label for token in ("courier", "mono", "console") - ) - is_serif = bool(flags & TEXT_FLAG_SERIF) or any( - token in font_label - for token in ("times", "serif", "georgia", "garamond", "mistral") - ) - return is_bold, is_italic, is_mono, is_serif - - -def _base14_fontname_for_style(style: dict[str, Any]) -> str: - """Return a Base-14 font name based on detected style flags.""" - is_bold, is_italic, is_mono, is_serif = _style_flags(style) - - if is_mono: - family = "Courier" - elif is_serif: - family = "Times" - else: - family = "Helvetica" - - variants = { - ("Helvetica", False, False): "Helvetica", - ("Helvetica", True, False): "Helvetica-Bold", - ("Helvetica", False, True): "Helvetica-Oblique", - ("Helvetica", True, True): "Helvetica-BoldOblique", - ("Times", False, False): "Times-Roman", - ("Times", True, False): 
"Times-Bold", - ("Times", False, True): "Times-Italic", - ("Times", True, True): "Times-BoldItalic", - ("Courier", False, False): "Courier", - ("Courier", True, False): "Courier-Bold", - ("Courier", False, True): "Courier-Oblique", - ("Courier", True, True): "Courier-BoldOblique", - } - return variants[(family, is_bold, is_italic)] - - -def _build_flexible_pattern(text: str) -> str: - tokens = [re.escape(tok) for tok in re.split(r"\s+", text.strip()) if tok] - return r"\s+".join(tokens) - - -def _find_flexible( - haystack: str, - needle: str, - start: int = 0, -) -> tuple[int, int] | None: - if not needle: - return None - - idx = haystack.find(needle, start) - if idx >= 0: - return idx, idx + len(needle) - - pattern = _build_flexible_pattern(needle) - if not pattern: - return None - - match = re.search(pattern, haystack[start:]) - if match: - return start + match.start(), start + match.end() - - if start > 0: - match = re.search(pattern, haystack) - if match: - return match.start(), match.end() - - return None - - -def _label_start(label: dict) -> int: - attrs = label.get("attrs") or {} - alt = attrs.get("aymurai_alt_start_char") - start = label.get("start_char") - return int(alt if alt is not None else (start or 0)) - - -def _label_end(label: dict) -> int: - attrs = label.get("attrs") or {} - alt = attrs.get("aymurai_alt_end_char") - end = label.get("end_char") - return int(alt if alt is not None else (end or 0)) - - -def _label_surface_text(label: dict, document: str) -> str: - attrs = label.get("attrs") or {} - - # Prefer explicit alt text when it has an actual value. 
- alt_text = attrs.get("aymurai_alt_text") - if alt_text is not None: - return str(alt_text) if alt_text else "" - - # Use alt char offsets when available - alt_start = attrs.get("aymurai_alt_start_char") - alt_end = attrs.get("aymurai_alt_end_char") - - if alt_start is not None and alt_end is not None: - start, end = int(alt_start), int(alt_end) - if 0 <= start < end <= len(document): - return document[start:end] - return "" - - # No alt info available; use raw char offsets - start = int(label.get("start_char") or 0) - end = int(label.get("end_char") or 0) - if 0 <= start < end <= len(document): - return document[start:end] - - text = label.get("text") - return str(text) if text else "" - - -def _same_boundary_candidate(left: dict, right: dict) -> bool: - left_attrs = left.get("attrs") or {} - right_attrs = right.get("attrs") or {} - - if left_attrs.get("aymurai_label") != right_attrs.get("aymurai_label"): - return False - - left_cid = left_attrs.get("canonical_entity_id") - right_cid = right_attrs.get("canonical_entity_id") - if left_cid and right_cid and str(left_cid) != str(right_cid): - return False - - left_text = str(left.get("text") or "").strip() - right_text = str(right.get("text") or "").strip() - return bool(left_text and right_text) - - -def _resolve_token(label: dict, render_context: dict[str, Any] | None) -> str: - boundary_token = label.get("_boundary_token") - if boundary_token: - return boundary_token - - token = resolve_render_token(label, render_context) - return token or "ENT" - - -def _token_parts(token: str) -> tuple[str, str | None]: - match = re.match(r"^(.*?)(?:_(\d+))?$", token) - if not match: - normalized = token.strip() or "ENT" - return normalized, None - - base = match.group(1).strip() or "ENT" - suffix = match.group(2) - return base, suffix - - -def _abbreviate_token(base: str, length: int) -> str: - normalized = "".join(char for char in base.upper() if char.isalnum()) - if not normalized: - normalized = "ENT" - return 
normalized[:length] or normalized[:1] or "E" - - -def _token_aliases(base: str) -> tuple[str, ...]: - aliases = PDF_TOKEN_ALIAS_MAP.get(base.upper(), ()) - normalized_aliases: list[str] = [] - - for alias in aliases: - normalized = re.sub(r"[^A-Z0-9_]", "", str(alias).upper()) - if ( - normalized - and normalized != base.upper() - and normalized not in normalized_aliases - ): - normalized_aliases.append(normalized) - - return tuple(normalized_aliases) - - -def _build_display_token_candidates(token: str) -> list[str]: - base, suffix = _token_parts(token.upper()) - candidates: list[str] = [] - - def add(value: str) -> None: - if value and value not in candidates: - candidates.append(value) - - def add_base_variants(label: str) -> None: - if suffix: - add(f"<{label}_{suffix}>") - add(f"<{label}>") - - add_base_variants(base) - - for alias in _token_aliases(base): - add_base_variants(alias) - - abbreviated = _abbreviate_token(base, PDF_TAG_MAX_ABBREVIATION) - add_base_variants(abbreviated) - - return candidates - - -def _iter_font_sizes(start_size: float) -> list[float]: - if start_size <= 0: - return [] - - sizes: list[float] = [start_size] - current = start_size - while current - PDF_TAG_FONT_STEP >= PDF_TAG_MIN_FONT_SIZE - 1e-6: - current = round(current - PDF_TAG_FONT_STEP, 2) - if current not in sizes: - sizes.append(current) - - return sizes - - -def _fit_display_token( - token: str, - rect: pymupdf.Rect, - fontname: str, - base_font_size: float, - font_obj: pymupdf.Font | None = None, -) -> tuple[str | None, float | None]: - """Find the best display candidate that fits inside *rect*. - - When *font_obj* is provided its ``text_length`` method is used for pixel- - accurate measurement; otherwise the Base-14 ``pymupdf.get_text_length`` - function is used as a fallback. 
- """ - if rect.width <= 0 or rect.height <= 0: - return None, None - - available_width = max(rect.width - (2 * PDF_TAG_RECT_INSET), 1.0) - start_size = min(base_font_size, max(rect.height - 1.0, 1.0)) - if start_size < 1.0: - return None, None - - def _measure(text: str, size: float) -> float: - if font_obj is not None: - try: - return font_obj.text_length(text, fontsize=size) - except Exception: - pass - return pymupdf.get_text_length(text, fontname=fontname, fontsize=size) - - for size in _iter_font_sizes(start_size): - for candidate in _build_display_token_candidates(token): - if _measure(candidate, size) <= available_width + 0.1: - return candidate, size - - return None, None - - -# Cache of Base-14 pymupdf.Font objects (they are reusable and thread-safe). -_BASE14_FONT_CACHE: dict[str, pymupdf.Font] = {} - - -def _get_base14_font(style: dict[str, Any]) -> pymupdf.Font: - """Return a ``pymupdf.Font`` built from the Base-14 name that matches - *style*. The object is cached so repeated calls are essentially free. 
- - Base-14 fonts always contain the full Latin character set (including - ``<``, ``>``, ``_``, digits) and correctly carry bold / italic weight, - unlike subset font buffers extracted from the PDF.""" - name = _base14_fontname_for_style(style) - font = _BASE14_FONT_CACHE.get(name) - if font is None: - font = pymupdf.Font(name) - _BASE14_FONT_CACHE[name] = font - return font - - -def _apply_minimal_boundary_merge( - paragraphs: list[dict], - render_context: dict[str, Any] | None, -) -> None: - for left_par, right_par in zip(paragraphs, paragraphs[1:]): - left_doc = left_par.get("document") or "" - right_doc = right_par.get("document") or "" - left_labels = left_par.get("labels") or [] - right_labels = right_par.get("labels") or [] - - if not left_doc or not right_doc or not left_labels or not right_labels: - continue - - left_candidates = [ - label - for label in left_labels - if _label_end(label) >= max(0, len(left_doc) - 2) - ] - right_candidates = [label for label in right_labels if _label_start(label) <= 2] - - if not left_candidates or not right_candidates: - continue - - for left_label in left_candidates: - for right_label in right_candidates: - if not _same_boundary_candidate(left_label, right_label): - continue - - shared_token = _resolve_token(left_label, render_context) - if not shared_token: - shared_token = _resolve_token(right_label, render_context) - if shared_token: - left_label["_boundary_token"] = shared_token - right_label["_boundary_token"] = shared_token - break - - -def _build_layout_paragraphs(parsed_doc: Any) -> list[dict]: - chunks = parsed_doc.to_text( - page_chunks=True, - header=True, - footer=True, - show_progress=False, - ) - - paragraphs: list[dict] = [] - layout_index = 0 - for page_idx, (page, chunk) in enumerate(zip(parsed_doc.pages, chunks)): - page_text = chunk.get("text") or "" - page_boxes = chunk.get("page_boxes") or [] - - for box_meta in page_boxes: - box_idx = int(box_meta["index"]) - if box_idx >= len(page.boxes): - 
continue - - start, stop = box_meta.get("pos", (0, 0)) - box_text = normalize("NFKC", page_text[start:stop]).strip() - if not box_text: - continue - - box = page.boxes[box_idx] - line_entries: list[dict] = [] - line_text_chunks: list[str] = [] - line_cursor = 0 - - for line_idx, line in enumerate(box.textlines or []): - text = normalize("NFKC", _line_text(line)).strip() - if not text: - continue - - if line_text_chunks: - line_text_chunks.append("\n") - line_cursor += 1 - - line_start = line_cursor - line_text_chunks.append(text) - line_cursor += len(text) - line_end = line_cursor - style = _line_style(line) - spans_detail, strip_offset = _build_spans_detail(line) - - line_entries.append( - { - "page_index": page_idx, - "box_index": box_idx, - "line_index": line_idx, - "bbox": _rect_tuple(line["bbox"]), - "font_size": _font_size(line, float(style.get("size") or 10.0)), - "start": line_start, - "end": line_end, - "text": text, - "style": style, - "spans_detail": spans_detail, - "strip_offset": strip_offset, - } - ) - - line_text = "".join(line_text_chunks) - if not line_text: - continue - - paragraphs.append( - { - "plain_text": box_text, - "metadata": { - "layout_index": layout_index, - "page_index": page_idx, - "page_number": page.page_number, - "box_index": box_idx, - "boxclass": box.boxclass, - "box_bbox": ( - float(box.x0), - float(box.y0), - float(box.x1), - float(box.y1), - ), - "line_text": line_text, - "lines": line_entries, - }, - } - ) - layout_index += 1 - - return paragraphs - - -def _match_predictions_to_layout( - layout_paragraphs: list[dict], - preds: list[dict], -) -> list[dict]: - if not layout_paragraphs or not preds: - return [] - - available_indices = list(range(len(layout_paragraphs))) - all_indices = list(range(len(layout_paragraphs))) - matched: list[dict] = [] - - normalized_layout_texts = [ - normalize("NFKC", paragraph["plain_text"]).strip() - for paragraph in layout_paragraphs - ] - - for pred_idx, pred in enumerate(preds): - pred_text = 
normalize("NFKC", str(pred.get("document") or "")).strip() - if not pred_text: - continue - - candidate_pool = available_indices if available_indices else all_indices - exact_idx = next( - ( - idx - for idx in candidate_pool - if normalized_layout_texts[idx] == pred_text - ), - None, - ) - - if exact_idx is None: - exact_idx = min( - candidate_pool, - key=lambda idx: cer(pred_text, normalized_layout_texts[idx]), - ) - - paragraph = deepcopy(layout_paragraphs[exact_idx]) - paragraph["document"] = pred.get("document") or "" - paragraph["labels"] = pred.get("labels") or [] - paragraph["pred_index"] = pred_idx - matched.append(paragraph) - - if exact_idx in available_indices: - available_indices.remove(exact_idx) - - matched.sort(key=lambda paragraph: paragraph["metadata"]["layout_index"]) - return matched - - -def _rect_vertical_overlap(left: pymupdf.Rect, right: pymupdf.Rect) -> float: - overlap = max(0.0, min(left.y1, right.y1) - max(left.y0, right.y0)) - min_height = max(min(left.height, right.height), 1e-6) - return overlap / min_height - - -def _group_adjacent_rects( - rects: list[pymupdf.Rect], max_gap: float -) -> list[pymupdf.Rect]: - if not rects: - return [] - - ordered = sorted(rects, key=lambda rect: (rect.y0, rect.x0, rect.x1)) - groups: list[list[pymupdf.Rect]] = [[ordered[0]]] - - for rect in ordered[1:]: - previous = groups[-1][-1] - gap = rect.x0 - previous.x1 - if _rect_vertical_overlap(previous, rect) >= 0.5 and gap <= max_gap: - groups[-1].append(rect) - else: - groups.append([rect]) - - merged_rects: list[pymupdf.Rect] = [] - for group in groups: - merged = pymupdf.Rect(group[0]) - for rect in group[1:]: - merged.include_rect(rect) - merged_rects.append(merged) - - return merged_rects - - -def _pick_rect_group_for_segment( - page: pymupdf.Page, - line: dict, - text: str, - line_x_cursor: dict[tuple[int, int, int], float], -) -> pymupdf.Rect: - clip = pymupdf.Rect(line["bbox"]) - rects = [rect for rect in page.search_for(text, clip=clip) if 
rect.intersects(clip)] - if not rects: - return clip - - max_gap = min( - max(clip.height * PDF_TAG_RECT_GAP_FACTOR, PDF_TAG_RECT_GAP_MIN), - PDF_TAG_RECT_GAP_MAX, - ) - grouped_rects = _group_adjacent_rects(rects, max_gap=max_gap) - - line_key = (line["page_index"], line["box_index"], line["line_index"]) - min_x = line_x_cursor.get(line_key, clip.x0 - 1) - - for rect in grouped_rects: - if rect.x0 >= min_x - 0.5: - line_x_cursor[line_key] = rect.x1 - return rect - - chosen = grouped_rects[0] - line_x_cursor[line_key] = chosen.x1 - return chosen - - -def _padded_rect(rect: pymupdf.Rect, clip: pymupdf.Rect) -> pymupdf.Rect: - padded = pymupdf.Rect(rect) - padded.x0 = max(clip.x0, padded.x0 - PDF_TAG_RECT_X_PADDING) - padded.y0 = max(clip.y0, padded.y0 - PDF_TAG_RECT_Y_PADDING) - padded.x1 = min(clip.x1, padded.x1 + PDF_TAG_RECT_X_PADDING) - padded.y1 = min(clip.y1, padded.y1 + PDF_TAG_RECT_Y_PADDING) - return padded - - -def _render_rect(rect: pymupdf.Rect) -> pymupdf.Rect: - render_rect = pymupdf.Rect(rect) - inset = min(PDF_TAG_RECT_INSET, max(render_rect.height * 0.1, 0.0)) - render_rect.x0 += inset - render_rect.x1 -= inset - if render_rect.x1 <= render_rect.x0: - render_rect = pymupdf.Rect(rect) - return render_rect - - -def _text_redact_rect(rect: pymupdf.Rect) -> pymupdf.Rect: - redact_rect = pymupdf.Rect(rect) - edge_inset = min(0.25, max(redact_rect.width * 0.01, 0.05)) - if redact_rect.width > (2 * edge_inset): - redact_rect.x0 += edge_inset - redact_rect.x1 -= edge_inset - return redact_rect - - -def _normalize_line_chars(spans: list[dict]) -> list[dict[str, Any]]: - chars: list[dict[str, Any]] = [] - for span in spans: - for char in span.get("chars") or []: - norm_text = normalize("NFKC", str(char.get("c") or "")) - if not norm_text: - continue - bbox = pymupdf.Rect(char["bbox"]) - for norm_char in norm_text: - chars.append({"char": norm_char, "bbox": bbox}) - return chars - - -def _line_chars_from_page(page: pymupdf.Page, line: dict) -> list[dict[str, 
Any]]: - clip = pymupdf.Rect(line["bbox"]) - raw = page.get_text("rawdict", clip=clip) - target_text = normalize("NFKC", str(line.get("text") or "")).strip() - - best_chars: list[dict[str, Any]] = [] - best_score: tuple[float, float, float] | None = None - - for block in raw.get("blocks") or []: - if block.get("type", 0) != 0: - continue - for raw_line in block.get("lines") or []: - chars = _normalize_line_chars(raw_line.get("spans") or []) - if not chars: - continue - - candidate_rect = pymupdf.Rect(raw_line["bbox"]) - candidate_text = "".join(entry["char"] for entry in chars).strip() - overlap = ( - _rect_vertical_overlap(candidate_rect, clip) - if candidate_rect.intersects(clip) - else 0.0 - ) - text_score = 0.0 - if target_text or candidate_text: - text_score = ( - 0.0 - if target_text == candidate_text - else cer(target_text, candidate_text) - ) - bbox_score = ( - abs(candidate_rect.x0 - clip.x0) - + abs(candidate_rect.y0 - clip.y0) - + abs(candidate_rect.x1 - clip.x1) - + abs(candidate_rect.y1 - clip.y1) - ) / 100.0 - score = (1.0 - overlap, text_score, bbox_score) - if best_score is None or score < best_score: - best_score = score - best_chars = chars - - return best_chars - - -def _line_chars_text(chars: list[dict[str, Any]]) -> str: - return "".join(str(entry.get("char") or "") for entry in chars) - - -def _find_line_char_span( - chars: list[dict[str, Any]], - text: str, - *, - start: int = 0, - raw_text: str | None = None, -) -> tuple[int, int] | None: - """ - Match *text* against the raw character stream for a line. - - ``line["text"]`` comes from PyMuPDF layout text and can differ from the - raw character stream returned by ``rawdict``. Searching the raw stream - keeps the redaction rectangle aligned with the actual glyph boxes. 
- """ - if not chars or not text: - return None - - haystack = raw_text if raw_text is not None else _line_chars_text(chars) - pattern = _build_flexible_pattern(text) - - def _search(offset: int) -> tuple[int, int] | None: - exact_idx = haystack.find(text, offset) - flexible_span = None - if pattern: - match = re.search(pattern, haystack[offset:]) - if match is not None: - flexible_span = (offset + match.start(), offset + match.end()) - - if exact_idx < 0: - return flexible_span - exact_span = (exact_idx, exact_idx + len(text)) - if flexible_span is None: - return exact_span - return min(exact_span, flexible_span, key=lambda span: span[0]) - - span = _search(start) - if span is None and start > 0: - span = _search(0) - return span - - -def _rect_from_char_slice( - chars: list[dict[str, Any]], - start: int, - end: int, -) -> pymupdf.Rect | None: - if not chars: - return None - - slice_start = max(int(start), 0) - slice_end = min(int(end), len(chars)) - if slice_end <= slice_start: - return None - - segment = chars[slice_start:slice_end] - if not segment: - return None - - boxes = [entry["bbox"] for entry in segment if str(entry["char"]).strip()] - if not boxes: - boxes = [entry["bbox"] for entry in segment] - if not boxes: - return None - - rect = pymupdf.Rect(boxes[0]) - for bbox in boxes[1:]: - rect.include_rect(bbox) - return rect - - -def _build_page_op( - rect: pymupdf.Rect, - line: dict | None, - token: str, - is_image: bool = False, - entity_style: dict[str, Any] | None = None, -) -> dict[str, Any]: - line_clip = pymupdf.Rect(line["bbox"]) if line else pymupdf.Rect(rect) - canvas_rect = _padded_rect(rect, line_clip) - render_rect = _render_rect(canvas_rect) - style = entity_style or (line or {}).get("style") or _default_style() - base_font_size = float((line or {}).get("font_size") or style.get("size") or 10.0) - - # Always use Base-14 fonts: they carry correct bold/italic weight and - # contain all glyphs needed for tags (<, >, _, digits, letters). 
- # Subset font buffers extracted from the PDF lack many of these glyphs. - fontname = _base14_fontname_for_style(style) - font_obj = _get_base14_font(style) - - display_token, fitted_size = _fit_display_token( - token, - render_rect, - fontname, - base_font_size, - font_obj=font_obj, - ) - - if not display_token or fitted_size is None: - logger.warning( - "Could not fit PDF token '%s' inside rect=%s", - token, - tuple(round(value, 2) for value in canvas_rect), - ) - - return { - "redact_rect": _text_redact_rect(rect), - "background_rect": canvas_rect, - "canvas_rect": canvas_rect, - "render_rect": render_rect, - "line_rect": line_clip, - "text": display_token, - "logical_token": token, - "fontname": fontname, - "fontsize": fitted_size, - "text_align": pymupdf.TEXT_ALIGN_LEFT, - "text_color": style.get("color") or (0.0, 0.0, 0.0), - "style": style, - } - - -def _signature_background_rect( - op: dict[str, Any], - widget_rect: pymupdf.Rect, -) -> pymupdf.Rect: - background = pymupdf.Rect( - op.get("line_rect") or op.get("canvas_rect") or widget_rect - ) - canvas_rect = op.get("canvas_rect") - if canvas_rect is not None: - background.include_rect(pymupdf.Rect(canvas_rect)) - - pad_x = max(background.height * 0.75, 2.0) - pad_y = max(background.height * 0.25, 0.75) - widget_clip = pymupdf.Rect(widget_rect) - - background.x0 = max(widget_clip.x0, background.x0 - pad_x) - background.y0 = max(widget_clip.y0, background.y0 - pad_y) - background.x1 = min(widget_clip.x1, background.x1 + pad_x) - background.y1 = min(widget_clip.y1, background.y1 + pad_y) - return background - - -def _image_rects_for_clip( - page: pymupdf.Page, - clip: pymupdf.Rect, -) -> list[pymupdf.Rect]: - """Return bounding rectangles of images that overlap *clip*.""" - rects: list[pymupdf.Rect] = [] - for img_info in page.get_image_info(): - bbox = img_info.get("bbox") - if bbox is None: - continue - img_rect = pymupdf.Rect(bbox) - if img_rect.intersects(clip) and img_rect.get_area() > 0: - 
rects.append(img_rect) - return rects - - -def _entity_overlaps_image( - page: pymupdf.Page, - entity_rect: pymupdf.Rect, - image_rects: list[pymupdf.Rect], -) -> pymupdf.Rect | None: - """If *entity_rect* overlaps an image return the image rect, else None.""" - for img_rect in image_rects: - overlap = _rect_vertical_overlap(entity_rect, img_rect) - if overlap >= _IMAGE_OVERLAP_THRESHOLD and entity_rect.intersects(img_rect): - return img_rect - return None - - -def _widget_text_color(widget: pymupdf.Widget) -> tuple[float, float, float]: - values = list(widget.text_color or []) - if not values: - return (0.0, 0.0, 0.0) - if len(values) == 1: - shade = float(values[0]) - return (shade, shade, shade) - if len(values) >= 3: - return tuple(float(value) for value in values[:3]) - return (0.0, 0.0, 0.0) - - -def _style_from_widget(widget: pymupdf.Widget) -> dict[str, Any]: - return { - "font": str(widget.text_font or ""), - "flags": 0, - "color": _widget_text_color(widget), - "size": float(widget.text_fontsize or 10.0), - "ascender": 0.8, - "descender": -0.2, - } - - -def _page_widget_infos(page: pymupdf.Page) -> list[dict[str, Any]]: - infos: list[dict[str, Any]] = [] - for widget in page.widgets() or []: - if widget.field_type not in ( - pymupdf.PDF_WIDGET_TYPE_TEXT, - pymupdf.PDF_WIDGET_TYPE_SIGNATURE, - ): - continue - infos.append( - { - "xref": int(widget.xref), - "field_type": int(widget.field_type), - "field_name": str(widget.field_name or ""), - "field_value": str(widget.field_value or ""), - "rect": pymupdf.Rect(widget.rect), - "style": _style_from_widget(widget), - } - ) - return infos - - -def _entity_overlaps_widget( - entity_rect: pymupdf.Rect, - widget_infos: list[dict[str, Any]], -) -> dict[str, Any] | None: - best_widget: dict[str, Any] | None = None - best_area = 0.0 - for widget_info in widget_infos: - widget_rect = widget_info["rect"] - if not entity_rect.intersects(widget_rect): - continue - area = (entity_rect & widget_rect).get_area() - if area > 
best_area: - best_area = area - best_widget = widget_info - return best_widget - - -def _fit_widget_token( - widget_info: dict[str, Any], - current_text: str, - entity_span: tuple[int, int], - token: str, -) -> str: - style = widget_info.get("style") or _default_style() - rect = pymupdf.Rect(widget_info["rect"]) - font_obj = _get_base14_font(style) - max_width = max(rect.width - 1.0, 1.0) - - prefix = current_text[: entity_span[0]] - suffix = current_text[entity_span[1] :] - - for candidate in _build_display_token_candidates(token): - candidate_text = f"{prefix}{candidate}{suffix}" - if ( - font_obj.text_length( - candidate_text, fontsize=float(style.get("size") or 10.0) - ) - <= max_width + 0.1 - ): - return candidate - - candidates = _build_display_token_candidates(token) - return candidates[0] if candidates else f"<{token}>" - - -def _apply_widget_ops( - doc: pymupdf.Document, - widget_ops: dict[int, list[dict]], -) -> None: - for page_idx, ops in widget_ops.items(): - if not ops: - continue - - page = doc[page_idx] - widgets = { - int(widget.xref): widget - for widget in (page.widgets() or []) - if widget.field_type == pymupdf.PDF_WIDGET_TYPE_TEXT - } - grouped: dict[int, list[dict]] = {} - for op in ops: - grouped.setdefault(int(op["widget_xref"]), []).append(op) - - for widget_xref, replacements in grouped.items(): - widget = widgets.get(widget_xref) - if widget is None: - logger.warning( - "Could not resolve PDF widget xref=%s on page=%s", - widget_xref, - page_idx, - ) - continue - - current_text = str(widget.field_value or "") - if not current_text: - continue - - search_cursor = 0 - changed = False - for replacement in replacements: - entity_text = replacement["entity_text"] - span = _find_flexible(current_text, entity_text, start=search_cursor) - if span is None: - span = _find_flexible(current_text, entity_text, start=0) - if span is None: - logger.warning( - "Could not map widget label '%s' in widget '%s' on page=%s", - entity_text, - 
replacement.get("field_name") or widget.field_name, - page_idx, - ) - continue - - token_text = _fit_widget_token( - replacement["widget_info"], - current_text, - span, - replacement["logical_token"], - ) - current_text = ( - f"{current_text[: span[0]]}{token_text}{current_text[span[1] :]}" - ) - search_cursor = span[0] + len(token_text) - changed = True - - if not changed: - continue - - try: - widget.field_value = current_text - widget.update() - except Exception as exc: - logger.warning( - "Failed to update PDF widget '%s' on page=%s: %s", - widget.field_name, - page_idx, - exc, - ) - - -def _apply_signature_widget_ops( - doc: pymupdf.Document, - signature_widget_ops: dict[int, list[dict]], -) -> None: - for page_idx, ops in signature_widget_ops.items(): - if not ops: - continue - - page = doc[page_idx] - widgets = { - int(widget.xref): widget - for widget in (page.widgets() or []) - if widget.field_type == pymupdf.PDF_WIDGET_TYPE_SIGNATURE - } - grouped: dict[int, list[dict]] = {} - for op in ops: - grouped.setdefault(int(op["widget_xref"]), []).append(op) - - for widget_xref, widget_group_ops in grouped.items(): - widget = widgets.get(widget_xref) - widget_rect = pymupdf.Rect( - widget_group_ops[0].get("widget_rect") or (0, 0, 0, 0) - ) - appearance_png: bytes | None = None - - if widget is not None: - widget_rect = pymupdf.Rect(widget.rect) - try: - scale = 200 / 72.0 - pix = page.get_pixmap( - clip=widget_rect, - matrix=pymupdf.Matrix(scale, scale), - alpha=False, - ) - appearance_png = pix.tobytes("png") - except Exception as exc: - logger.warning( - "Could not snapshot signature widget xref=%s on page=%s: %s", - widget_xref, - page_idx, - exc, - ) - - try: - page.delete_widget(widget) - except Exception as exc: - logger.warning( - "Failed to delete signature widget xref=%s on page=%s: %s", - widget_xref, - page_idx, - exc, - ) - appearance_png = None - else: - logger.warning( - "Could not resolve PDF signature widget xref=%s on page=%s", - widget_xref, - 
page_idx, - ) - - if appearance_png and widget_rect.get_area() > 0: - try: - page.insert_image(widget_rect, stream=appearance_png, overlay=True) - except Exception as exc: - logger.warning( - "Failed to restore signature widget appearance xref=%s on page=%s: %s", - widget_xref, - page_idx, - exc, - ) - - for op in widget_group_ops: - op["background_rect"] = _signature_background_rect(op, widget_rect) - _render_text_op(page, op) - - -def _collect_page_redactions( - doc: pymupdf.Document, - paragraphs: list[dict], - render_context: dict[str, Any] | None, -) -> dict[int, list[dict]]: - page_ops: dict[int, list[dict]] = {} - widget_ops: dict[int, list[dict]] = {} - signature_widget_ops: dict[int, list[dict]] = {} - line_x_cursor: dict[tuple[int, int, int], float] = {} - line_char_cache: dict[tuple[int, int, int], list[dict[str, Any]]] = {} - line_char_text_cache: dict[tuple[int, int, int], str] = {} - line_char_cursor: dict[tuple[int, int, int], int] = {} - - # Pre-compute image rects and widgets per page - page_image_rects: dict[int, list[pymupdf.Rect]] = {} - page_widgets: dict[int, list[dict[str, Any]]] = {} - - for paragraph in paragraphs: - metadata = paragraph.get("metadata") or {} - lines = metadata.get("lines") or [] - if not lines: - continue - - page_index = int(metadata["page_index"]) - page = doc[page_index] - line_text = metadata.get("line_text") or "" - box_clip = pymupdf.Rect(metadata.get("box_bbox") or page.rect) - document = paragraph.get("document") or "" - labels = sorted(paragraph.get("labels") or [], key=_label_start) - search_cursor = 0 - - # Lazy-load image rects and widget infos for this page - if page_index not in page_image_rects: - page_image_rects[page_index] = _image_rects_for_clip(page, page.rect) - if page_index not in page_widgets: - page_widgets[page_index] = _page_widget_infos(page) - - for label in labels: - entity_text = _label_surface_text(label, document).strip() - if not entity_text: - # Fall back to raw label text only if alt 
processing was - # not applied (no alt attributes present at all). - attrs = label.get("attrs") or {} - alt_applied = any( - key in attrs - for key in ( - "aymurai_alt_text", - "aymurai_alt_start_char", - "aymurai_alt_end_char", - ) - ) - if not alt_applied: - entity_text = str(label.get("text") or "").strip() - if not entity_text: - continue - - token = _resolve_token(label, render_context) - - span = _find_flexible(line_text, entity_text, start=search_cursor) - if span is None: - span = _find_flexible(line_text, entity_text, start=0) - if span is None: - # -- Fallback: direct page search -- - fallback_rects = [ - rect - for rect in page.search_for(entity_text, clip=box_clip) - if rect.intersects(box_clip) - ] - - # Check if this is a widget-backed entity before falling back to images - if fallback_rects: - fallback_widget = _entity_overlaps_widget( - fallback_rects[0], - page_widgets[page_index], - ) - if fallback_widget is not None: - if ( - fallback_widget["field_type"] - == pymupdf.PDF_WIDGET_TYPE_TEXT - ): - widget_ops.setdefault(page_index, []).append( - { - "widget_xref": fallback_widget["xref"], - "field_name": fallback_widget["field_name"], - "widget_info": fallback_widget, - "entity_text": entity_text, - "logical_token": token, - } - ) - continue - if ( - fallback_widget["field_type"] - == pymupdf.PDF_WIDGET_TYPE_SIGNATURE - ): - op = _build_page_op( - fallback_rects[0], - lines[0] if lines else None, - token, - entity_style=fallback_widget.get("style") or None, - ) - op["widget_xref"] = fallback_widget["xref"] - op["widget_rect"] = fallback_widget["rect"] - signature_widget_ops.setdefault(page_index, []).append(op) - continue - - # Check if this is an image-based entity - if not fallback_rects: - img_match = _try_image_entity( - page, - entity_text, - box_clip, - page_image_rects[page_index], - ) - if img_match is not None: - op = _build_page_op( - img_match, - lines[0] if lines else None, - token, - is_image=True, - ) - op["image_rect"] = img_match - 
page_ops.setdefault(page_index, []).append(op) - continue - - if fallback_rects: - grouped_rects = _group_adjacent_rects( - fallback_rects, max_gap=PDF_TAG_RECT_GAP_MAX - ) - fallback_line = lines[0] if lines else None - - # Check if any of these rects overlap an image - for rect in grouped_rects: - img_rect = _entity_overlaps_image( - page, - rect, - page_image_rects[page_index], - ) - op = _build_page_op( - rect, - fallback_line, - token, - is_image=(img_rect is not None), - ) - if img_rect is not None: - op["image_rect"] = img_rect - page_ops.setdefault(page_index, []).append(op) - continue - - logger.warning( - "Could not map label '%s' on page=%s box=%s", - entity_text, - metadata.get("page_number"), - metadata.get("box_index"), - ) - continue - - search_cursor = span[1] - - # Collect line segments this entity spans - segments: list[ - tuple[ - dict, - str, - pymupdf.Rect, - pymupdf.Rect | None, - dict, - dict[str, Any] | None, - ] - ] = [] - for line in lines: - overlap_start = max(span[0], line["start"]) - overlap_end = min(span[1], line["end"]) - if overlap_end <= overlap_start: - continue - - segment_text = line_text[overlap_start:overlap_end].strip() - if not segment_text: - continue - - line_key = ( - line["page_index"], - line["box_index"], - line["line_index"], - ) - line_chars = line_char_cache.get(line_key) - if line_chars is None: - line_chars = _line_chars_from_page(page, line) - line_char_cache[line_key] = line_chars - - line_char_text = line_char_text_cache.get(line_key) - if line_char_text is None: - line_char_text = _line_chars_text(line_chars) - line_char_text_cache[line_key] = line_char_text - - raw_span = _find_line_char_span( - line_chars, - segment_text, - start=line_char_cursor.get(line_key, 0), - raw_text=line_char_text, - ) - rect = None - if raw_span is not None: - line_char_cursor[line_key] = raw_span[1] - rect = _rect_from_char_slice(line_chars, raw_span[0], raw_span[1]) - - if rect is None: - raw_start = ( - overlap_start - 
line["start"] + int(line.get("strip_offset", 0)) - ) - raw_end = ( - overlap_end - line["start"] + int(line.get("strip_offset", 0)) - ) - rect = _rect_from_char_slice(line_chars, raw_start, raw_end) - if rect is None: - rect = _pick_rect_group_for_segment( - page, - line, - segment_text, - line_x_cursor, - ) - - widget_info = _entity_overlaps_widget( - rect, - page_widgets[page_index], - ) - - # Check for image overlap - img_rect = _entity_overlaps_image( - page, - rect, - page_image_rects[page_index], - ) - - # Determine entity-specific style from the span that - # actually contains this text (not the line's dominant style) - offset_in_line = overlap_start - line["start"] - ent_style = _entity_style_from_spans(line, offset_in_line) - - segments.append( - (line, segment_text, rect, img_rect, ent_style, widget_info) - ) - - if not segments: - continue - - if len(segments) == 1: - # Single-line entity: route widget-backed content through the widget path. - line, _seg_text, rect, img_rect, ent_style, widget_info = segments[0] - if widget_info is not None: - if widget_info["field_type"] == pymupdf.PDF_WIDGET_TYPE_TEXT: - widget_ops.setdefault(page_index, []).append( - { - "widget_xref": widget_info["xref"], - "field_name": widget_info["field_name"], - "widget_info": widget_info, - "entity_text": entity_text, - "logical_token": token, - } - ) - continue - if widget_info["field_type"] == pymupdf.PDF_WIDGET_TYPE_SIGNATURE: - op = _build_page_op( - rect, - line, - token, - entity_style=ent_style, - ) - op["widget_xref"] = widget_info["xref"] - op["widget_rect"] = widget_info["rect"] - signature_widget_ops.setdefault(page_index, []).append(op) - continue - - op = _build_page_op( - rect, - line, - token, - is_image=(img_rect is not None), - entity_style=ent_style, - ) - if img_rect is not None: - op["image_rect"] = img_rect - page_ops.setdefault(page_index, []).append(op) - else: - # Multi-line entity: write the token on the widest segment only; blank the others. 
- widest_idx = max( - range(len(segments)), - key=lambda i: segments[i][2].width, - ) - any_image = any(seg[3] is not None for seg in segments) - - signature_widget = None - if all(seg[5] is not None for seg in segments): - widget_xrefs = {int(seg[5]["xref"]) for seg in segments} - widget_types = {int(seg[5]["field_type"]) for seg in segments} - if len(widget_xrefs) == 1 and widget_types == { - pymupdf.PDF_WIDGET_TYPE_SIGNATURE - }: - signature_widget = segments[0][5] - - for seg_idx, ( - seg_line, - _seg_text, - seg_rect, - seg_img, - seg_style, - seg_widget, - ) in enumerate(segments): - if seg_idx == widest_idx: - op = _build_page_op( - seg_rect, - seg_line, - token, - is_image=(any_image and signature_widget is None), - entity_style=seg_style, - ) - if seg_img is not None and signature_widget is None: - op["image_rect"] = seg_img - else: - op = _build_page_op( - seg_rect, - seg_line, - token, - is_image=( - (seg_img is not None) and signature_widget is None - ), - entity_style=seg_style, - ) - op["text"] = None - op["fontsize"] = None - if seg_img is not None and signature_widget is None: - op["image_rect"] = seg_img - - if signature_widget is not None: - op["widget_xref"] = signature_widget["xref"] - op["widget_rect"] = signature_widget["rect"] - signature_widget_ops.setdefault(page_index, []).append(op) - else: - page_ops.setdefault(page_index, []).append(op) - - return page_ops, widget_ops, signature_widget_ops - - -def _try_image_entity( - page: pymupdf.Page, - entity_text: str, - clip: pymupdf.Rect, - image_rects: list[pymupdf.Rect], -) -> pymupdf.Rect | None: - """When text search fails, check whether the entity region corresponds to - an image in the PDF (e.g. a scanned signature or stamp). If an image - overlaps the *clip* area, return its bounding rect so we can blank it. - - We try to locate the entity text on the page (ignoring clip) first: - if the text is found near an image, that image is the match. 
- Otherwise we fall back to returning the image with the best spatial - overlap with *clip*. - """ - if not image_rects: - return None - - # Try unclipped text search — the entity might be rendered as real text - # on top of (or near) an image. - text_hits = page.search_for(entity_text) - if text_hits: - for hit_rect in text_hits: - for img_rect in image_rects: - if hit_rect.intersects(img_rect): - return img_rect - - # Fallback: pick the image whose intersection with *clip* is largest - best: pymupdf.Rect | None = None - best_area = 0.0 - for img_rect in image_rects: - if not img_rect.intersects(clip) or img_rect.get_area() <= 0: - continue - intersection = img_rect & clip - area = intersection.get_area() - if area > best_area: - best_area = area - best = img_rect - - return best - - -def _append_cleanup_rect( - cleanup_rects: dict[int, list[pymupdf.Rect]], - page_idx: int, - rect: pymupdf.Rect | tuple[float, float, float, float] | None, -) -> None: - if rect is None: - return - - cleanup_rect = pymupdf.Rect(rect) - if cleanup_rect.get_area() <= 0: - return - cleanup_rects.setdefault(page_idx, []).append(cleanup_rect) - - -def _cleanup_rect_for_page_op(op: dict[str, Any]) -> pymupdf.Rect | None: - if op.get("image_rect") is not None: - cleanup_rect = pymupdf.Rect(op["image_rect"]) - redact_rect = op.get("redact_rect") - if redact_rect is not None: - cleanup_rect.include_rect(pymupdf.Rect(redact_rect)) - return cleanup_rect - - cleanup_source = ( - op.get("redact_rect") or op.get("background_rect") or op.get("canvas_rect") - ) - if cleanup_source is None: - return None - return pymupdf.Rect(cleanup_source) - - -def _cleanup_rect_for_widget_op(op: dict[str, Any]) -> pymupdf.Rect | None: - widget_info = op.get("widget_info") or {} - widget_rect = widget_info.get("rect") - if widget_rect is None: - return None - return pymupdf.Rect(widget_rect) - - -def _cleanup_rect_for_signature_widget_op(op: dict[str, Any]) -> pymupdf.Rect | None: - widget_rect = 
op.get("widget_rect") - if widget_rect is not None: - return pymupdf.Rect(widget_rect) - - background_rect = op.get("background_rect") or op.get("canvas_rect") - if background_rect is None: - return None - return pymupdf.Rect(background_rect) - - -def _collect_link_cleanup_rects( - page_ops: dict[int, list[dict]], - widget_ops: dict[int, list[dict]], - signature_widget_ops: dict[int, list[dict]], -) -> dict[int, list[pymupdf.Rect]]: - cleanup_rects: dict[int, list[pymupdf.Rect]] = {} - - for page_idx, ops in page_ops.items(): - for op in ops: - _append_cleanup_rect(cleanup_rects, page_idx, _cleanup_rect_for_page_op(op)) - - for page_idx, ops in widget_ops.items(): - for op in ops: - _append_cleanup_rect( - cleanup_rects, - page_idx, - _cleanup_rect_for_widget_op(op), - ) - - for page_idx, ops in signature_widget_ops.items(): - for op in ops: - _append_cleanup_rect( - cleanup_rects, - page_idx, - _cleanup_rect_for_signature_widget_op(op), - ) - - return cleanup_rects - - -def _remove_overlapping_page_links( - doc: pymupdf.Document, - cleanup_rects: dict[int, list[pymupdf.Rect]], -) -> None: - for page_idx, page_rects in cleanup_rects.items(): - if not page_rects: - continue - - page = doc[page_idx] - for link in list(page.get_links()): - link_rect = link.get("from") - if link_rect is None: - continue - link_rect = pymupdf.Rect(link_rect) - if not any(link_rect.intersects(rect) for rect in page_rects): - continue - try: - page.delete_link(link) - except Exception as exc: - logger.warning( - "Failed to delete PDF link on page=%s rect=%s: %s", - page_idx, - tuple(round(value, 2) for value in link_rect), - exc, - ) - - -def _scrub_pdf_metadata(doc: pymupdf.Document) -> None: - doc.set_metadata( - { - "title": "", - "author": "", - "subject": "", - "keywords": "", - "creator": "", - "producer": "", - "creationDate": "", - "modDate": "", - "trapped": "", - } - ) - - -def _apply_redactions( - doc: pymupdf.Document, - page_ops: dict[int, list[dict]], - widget_ops: dict[int, 
list[dict]], - signature_widget_ops: dict[int, list[dict]], -) -> None: - _apply_widget_ops(doc, widget_ops) - _apply_signature_widget_ops(doc, signature_widget_ops) - - for page_idx, ops in page_ops.items(): - page = doc[page_idx] - - for op in ops: - page.add_redact_annot( - op["redact_rect"], - text=None, - fill=(1, 1, 1), - cross_out=False, - ) - - page.apply_redactions( - images=pymupdf.PDF_REDACT_IMAGE_NONE, - graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, - text=pymupdf.PDF_REDACT_TEXT_REMOVE, - ) - - for op in ops: - _render_text_op(page, op) - - -def _render_text_op(page: pymupdf.Page, op: dict) -> None: - """Render a single anonymisation tag onto *page*.""" - canvas = pymupdf.Rect(op.get("background_rect") or op["canvas_rect"]) - if not op.get("skip_background_fill"): - page.draw_rect( - canvas, - color=(1, 1, 1), - fill=(1, 1, 1), - width=0, - overlay=True, - ) - - if not op.get("text") or not op.get("fontsize"): - return - - render = op["render_rect"] - line_rect = pymupdf.Rect(op.get("line_rect") or render) - style = op.get("style") or {} - base14_name = _base14_fontname_for_style(style) - font_obj = _get_base14_font(style) - - fontsize = float(op["fontsize"]) - descender = float(style.get("descender") or -0.2) - baseline_y = line_rect.y1 + (descender * fontsize) - baseline_y = min( - max(baseline_y, line_rect.y0 + (fontsize * 0.65)), - line_rect.y1 - 0.1, - ) - - text_width = font_obj.text_length(op["text"], fontsize=fontsize) - x_start = render.x0 + max((render.width - text_width) / 2.0, 0.0) - - try: - page.insert_text( - (x_start, baseline_y), - op["text"], - fontname=base14_name, - fontsize=fontsize, - color=op["text_color"], - overlay=True, - ) - return - except Exception as exc: - logger.debug("insert_text failed for '%s': %s", op["text"], exc) - - try: - tw = pymupdf.TextWriter(page.rect, color=op["text_color"]) - tw.fill_textbox( - render, - op["text"], - font=font_obj, - fontsize=fontsize, - align=op.get("text_align", pymupdf.TEXT_ALIGN_CENTER), 
- ) - tw.write_text(page, overlay=True) - return - except Exception as exc: - logger.debug("TextWriter failed for '%s': %s", op["text"], exc) - - try: - page.insert_textbox( - render, - op["text"], - fontname=base14_name, - fontsize=fontsize, - color=op["text_color"], - align=op.get("text_align", pymupdf.TEXT_ALIGN_CENTER), - overlay=True, - ) - except Exception as exc: - logger.warning( - "All text insertion methods failed for '%s': %s", - op["text"], - exc, - ) - - -@lru_cache(maxsize=1) -def _watermark_font_paths() -> tuple[str | None, str | None]: - search_roots = [ - Path("/workspace"), - Path("/usr/share/fonts"), - Path("/usr/local/share/fonts"), - Path.home() / ".local/share/fonts", - ] - candidates: list[Path] = [] - seen: set[str] = set() - - for root in search_roots: - if not root.exists(): - continue - try: - iterator = root.rglob("*") - except Exception: - continue - for path in iterator: - if not path.is_file() or path.suffix.lower() not in { - ".ttf", - ".otf", - ".ttc", - }: - continue - if "archivo" not in path.name.lower(): - continue - resolved = str(path.resolve()) - if resolved not in seen: - seen.add(resolved) - candidates.append(path) - - candidates = sorted(candidates, key=lambda item: item.name.lower()) - regular_path: str | None = None - bold_path: str | None = None - - for path in candidates: - name = path.name.lower() - if regular_path is None and "bold" not in name and "italic" not in name: - regular_path = str(path) - if bold_path is None and "bold" in name: - bold_path = str(path) - - if regular_path is None and candidates: - regular_path = str(candidates[0]) - if bold_path is None: - bold_path = regular_path - - return regular_path, bold_path - - -@lru_cache(maxsize=1) -def _watermark_font_config() -> dict[str, Any]: - regular_path, bold_path = _watermark_font_paths() - if regular_path: - try: - return { - "text_fontname": "archivo-watermark", - "text_fontfile": regular_path, - "text_font": pymupdf.Font(fontfile=regular_path), - 
"link_fontname": "archivo-watermark-bold", - "link_fontfile": bold_path or regular_path, - "link_font": pymupdf.Font(fontfile=bold_path or regular_path), - } - except Exception as exc: - logger.warning( - "Could not load Archivo font for PDF watermark, falling back to Helvetica: %s", - exc, - ) - - return { - "text_fontname": "Helvetica", - "text_fontfile": None, - "text_font": pymupdf.Font("Helvetica"), - "link_fontname": "Helvetica-Bold", - "link_fontfile": None, - "link_font": pymupdf.Font("Helvetica-Bold"), - } - - -def _watermark_text_length( - text: str, - *, - font_obj: pymupdf.Font, - fontname: str, - fontsize: float, -) -> float: - try: - return float(font_obj.text_length(text, fontsize=fontsize)) - except Exception: - return float( - pymupdf.get_text_length(text, fontname=fontname, fontsize=fontsize) - ) - - -def _insert_watermark_text( - page: pymupdf.Page, - point: tuple[float, float], - text: str, - *, - fontname: str, - fontsize: float, - color: tuple[float, float, float], - fontfile: str | None = None, -) -> None: - kwargs: dict[str, Any] = { - "fontsize": fontsize, - "fontname": fontname, - "color": color, - "overlay": True, - } - if fontfile: - kwargs["fontfile"] = fontfile - page.insert_text(point, text, **kwargs) - - -def _add_footer_watermark(doc: pymupdf.Document) -> None: - font_config = _watermark_font_config() - prefix_width = _watermark_text_length( - WATERMARK_PREFIX_TEXT, - font_obj=font_config["text_font"], - fontname=font_config["text_fontname"], - fontsize=WATERMARK_FONT_SIZE, - ) - link_width = _watermark_text_length( - WATERMARK_LINK_TEXT, - font_obj=font_config["link_font"], - fontname=font_config["link_fontname"], - fontsize=WATERMARK_FONT_SIZE, - ) - total_width = prefix_width + link_width - - for page_index, page in enumerate(doc): - if page_index % 2 == 0: - x_start = max( - WATERMARK_MARGIN_X, page.rect.width - total_width - WATERMARK_MARGIN_X - ) - else: - x_start = WATERMARK_MARGIN_X - - baseline_y = page.rect.height - 
WATERMARK_BASELINE_MARGIN - link_x = x_start + prefix_width - - _insert_watermark_text( - page, - (x_start, baseline_y), - WATERMARK_PREFIX_TEXT, - fontname=font_config["text_fontname"], - fontsize=WATERMARK_FONT_SIZE, - color=WATERMARK_TEXT_COLOR, - fontfile=font_config["text_fontfile"], - ) - _insert_watermark_text( - page, - (link_x, baseline_y), - WATERMARK_LINK_TEXT, - fontname=font_config["link_fontname"], - fontsize=WATERMARK_FONT_SIZE, - color=WATERMARK_LINK_COLOR, - fontfile=font_config["link_fontfile"], - ) - - underline_y = min(page.rect.height - 1.0, baseline_y + 1.0) - page.draw_line( - (link_x, underline_y), - (link_x + link_width, underline_y), - color=WATERMARK_LINK_COLOR, - width=0.8, - overlay=True, - ) - page.insert_link( - { - "kind": pymupdf.LINK_URI, - "from": pymupdf.Rect( - link_x, - baseline_y - WATERMARK_FONT_SIZE, - link_x + link_width, - min(page.rect.height, baseline_y + 2.0), - ), - "uri": WATERMARK_URL, - } - ) - - -@register_anonymizer -class PdfAnonymizer(BaseAnonymizer): - extension = "pdf" - - def anonymize( - self, - item: dict, - preds: list[dict], - output_dir: str = ".", - render_context: dict[str, Any] | None = None, - ) -> str: - item_path = Path(item["path"]) - file_path = self.ensure_file(item_path) - - if file_path.suffix.lower() != ".pdf": - raise InvalidDocumentAnonymizer("Only `.pdf` extension is allowed.") - - with pymupdf.open(str(file_path)) as doc: - parsed_doc = pymupdf4llm_document_layout.parse_document( - doc, - filename=str(file_path), - show_progress=False, - force_text=True, - use_ocr=False, - force_ocr=False, - ) - - layout_paragraphs = _build_layout_paragraphs(parsed_doc) - matched_paragraphs = _match_predictions_to_layout(layout_paragraphs, preds) - - _apply_minimal_boundary_merge(matched_paragraphs, render_context) - page_ops, widget_ops, signature_widget_ops = _collect_page_redactions( - doc, - matched_paragraphs, - render_context, - ) - _apply_redactions(doc, page_ops, widget_ops, signature_widget_ops) 
- cleanup_rects = _collect_link_cleanup_rects( - page_ops, - widget_ops, - signature_widget_ops, - ) - _remove_overlapping_page_links(doc, cleanup_rects) - _scrub_pdf_metadata(doc) - _add_footer_watermark(doc) - - os.makedirs(output_dir, exist_ok=True) - output_path = Path(output_dir) / f"{file_path.stem}.anonymized.pdf" - doc.save(str(output_path), garbage=4, clean=1, deflate=1) - - return str(output_path) diff --git a/aymurai/text/anonymization/pdf/__init__.py b/aymurai/text/anonymization/pdf/__init__.py new file mode 100644 index 0000000..21271aa --- /dev/null +++ b/aymurai/text/anonymization/pdf/__init__.py @@ -0,0 +1,3 @@ +from aymurai.text.anonymization.pdf.anonymizer import PdfAnonymizer + +__all__ = ["PdfAnonymizer"] diff --git a/aymurai/text/anonymization/pdf/anonymizer.py b/aymurai/text/anonymization/pdf/anonymizer.py new file mode 100644 index 0000000..0030c24 --- /dev/null +++ b/aymurai/text/anonymization/pdf/anonymizer.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Any + +import pymupdf +import pymupdf.layout # noqa: F401 # activates layout support +from pymupdf4llm.helpers import document_layout as pymupdf4llm_document_layout + +from aymurai.text.anonymization.base import ( + BaseAnonymizer, + InvalidDocumentAnonymizer, + register_anonymizer, +) +from aymurai.text.anonymization.pdf.layout import ( + _apply_minimal_boundary_merge, + _build_layout_paragraphs, + _match_predictions_to_layout, +) +from aymurai.text.anonymization.pdf.ops import ( + _apply_redactions, + _collect_page_redactions, +) +from aymurai.text.anonymization.pdf.sanitize import ( + _collect_link_cleanup_rects, + _sanitize_document, +) +from aymurai.text.anonymization.pdf.watermark import add_pdf_footer_watermark + + +@register_anonymizer +class PdfAnonymizer(BaseAnonymizer): + """ + Anonymize PDF documents by replacing sensitive data with label tokens. 
+ """ + + extension = "pdf" + + def anonymize( + self, + item: dict, + preds: list[dict], + output_dir: str = ".", + render_context: dict[str, Any] | None = None, + ) -> str: + """ + Anonymizes a PDF document using the matched paragraph predictions. + + Args: + item (dict): The item dictionary containing the input PDF path. + preds (list[dict]): The predictions to apply to the document. + output_dir (str, optional): The directory where the anonymized document should be written. Defaults to '.'. + render_context (dict[str, Any] | None, optional): The rendering context used to resolve replacement tokens. Defaults to None. + + Returns: + str: The path to the anonymized PDF output file. + """ + item_path = Path(item["path"]) + file_path = self.ensure_file(item_path) + + if file_path.suffix.lower() != ".pdf": + raise InvalidDocumentAnonymizer("Only `.pdf` extension is allowed.") + + with pymupdf.open(str(file_path)) as doc: + parsed_doc = pymupdf4llm_document_layout.parse_document( + doc, + filename=str(file_path), + show_progress=False, + force_text=True, + use_ocr=False, + force_ocr=False, + ) + + layout_paragraphs = _build_layout_paragraphs(parsed_doc) + matched_paragraphs = _match_predictions_to_layout( + layout_paragraphs, + preds, + ) + + _apply_minimal_boundary_merge(matched_paragraphs, render_context) + page_ops, widget_ops, signature_widget_ops = _collect_page_redactions( + doc, + matched_paragraphs, + render_context, + ) + _apply_redactions(doc, page_ops, widget_ops, signature_widget_ops) + cleanup_rects = _collect_link_cleanup_rects( + page_ops, + widget_ops, + signature_widget_ops, + ) + _sanitize_document(doc, cleanup_rects) + add_pdf_footer_watermark(doc) + + os.makedirs(output_dir, exist_ok=True) + output_path = Path(output_dir) / f"{file_path.stem}.anonymized.pdf" + doc.save(str(output_path), garbage=4, clean=1, deflate=1) + + return str(output_path) diff --git a/aymurai/text/anonymization/pdf/common.py b/aymurai/text/anonymization/pdf/common.py new file 
mode 100644 index 0000000..91f4292 --- /dev/null +++ b/aymurai/text/anonymization/pdf/common.py @@ -0,0 +1,620 @@ +from __future__ import annotations + +import re +from functools import lru_cache +from typing import Any +from unicodedata import normalize + +import pymupdf + +TEXT_FLAG_ITALIC = 2 +TEXT_FLAG_SERIF = 4 +TEXT_FLAG_MONOSPACED = 8 +TEXT_FLAG_BOLD = 16 +PDF_TAG_MIN_FONT_SIZE = 7.0 +PDF_TAG_FONT_STEP = 0.5 +PDF_TAG_MAX_ABBREVIATION = 3 +PDF_TOKEN_ALIAS_MAP: dict[str, tuple[str, str]] = { + "CORREO_ELECTRONICO": ("CORREO", "MAIL"), + "CUIT_CUIL": ("CUIT", "CUIL"), + "DIRECCION": ("DIREC", "DIR"), + "ESTUDIOS": ("ESTUD", "EDU"), + "MARCA_AUTOMOVIL": ("MARCA_AUTO", "AUTO"), + "NACIONALIDAD": ("NACIONAL", "NAC"), + "NOMBRE_ARCHIVO": ("NOM_ARCH", "ARCH"), + "NUM_ACTUACION": ("NUM_ACT", "ACT"), + "NUM_CAJA_AHORRO": ("NUM_CAJA", "CAJA"), + "NUM_EXPEDIENTE": ("NUM_EXP", "EXPTE"), + "NUM_MATRICULA": ("NUM_MAT", "MAT"), + "PATENTE_DOMINIO": ("PAT_DOM", "PAT"), + "TELEFONO": ("TELEF", "TEL"), + "TEXTO_ANONIMIZAR": ("TEXTO_ANON", "ANON"), + "USUARIX": ("USUAR", "USR"), +} +PDF_TAG_RECT_X_PADDING = 0.5 +PDF_TAG_RECT_Y_PADDING = 0.0 +PDF_TAG_RECT_INSET = 0.5 +PDF_TAG_RECT_GAP_FACTOR = 0.5 +PDF_TAG_RECT_GAP_MIN = 3.0 +PDF_TAG_RECT_GAP_MAX = 8.0 + + +def _line_text(line: dict) -> str: + """ + Builds the plain text content for a parsed PDF line. + + Args: + line (dict): The parsed line metadata being processed. + + Returns: + str: The concatenated text content for the line. + """ + return "".join(span.get("text", "") for span in line.get("spans", [])) + + +def _rect_tuple(value: Any) -> tuple[float, float, float, float]: + """ + Normalizes a rectangle-like value into a coordinate tuple. + + Args: + value (Any): The rectangle-like value to normalize. + + Returns: + tuple[float, float, float, float]: The normalized rectangle coordinates. 
+ """ + if isinstance(value, pymupdf.Rect): + return (float(value.x0), float(value.y0), float(value.x1), float(value.y1)) + if isinstance(value, (list, tuple)) and len(value) == 4: + return (float(value[0]), float(value[1]), float(value[2]), float(value[3])) + raise ValueError(f"Invalid rectangle value: {value}") + + +def _default_style(fallback_size: float = 10.0) -> dict[str, Any]: + """ + Builds a default text style dictionary for PDF rendering helpers. + + Args: + fallback_size (float, optional): The fallback font size used when no style data is available. Defaults to 10.0. + + Returns: + dict[str, Any]: The default style dictionary. + """ + return { + "font": "", + "flags": 0, + "color": (0.0, 0.0, 0.0), + "size": fallback_size, + "ascender": 0.8, + "descender": -0.2, + } + + +def _span_text_weight(span: dict) -> tuple[int, float]: + """ + Computes a sorting weight for a span based on text length and size. + + Args: + span (dict): The span metadata being evaluated. + + Returns: + tuple[int, float]: The text-length and size weight for the span. + """ + text = str(span.get("text") or "").strip() + return (len(text), float(span.get("size") or 0.0)) + + +def _pdf_color_from_span(span: dict) -> tuple[float, float, float]: + """ + Converts a span color value into PDF RGB components. + + Args: + span (dict): The span metadata being evaluated. + + Returns: + tuple[float, float, float]: The PDF RGB color components for the span. + """ + try: + return tuple( + float(value) for value in pymupdf.sRGB_to_pdf(int(span.get("color") or 0)) + ) + except Exception: + return (0.0, 0.0, 0.0) + + +def _line_style(line: dict, fallback_size: float = 10.0) -> dict[str, Any]: + """ + Determines the dominant text style for a parsed PDF line. + + Args: + line (dict): The parsed line metadata being processed. + fallback_size (float, optional): The fallback font size used when no style data is available. Defaults to 10.0. 
+ + Returns: + dict[str, Any]: The dominant style dictionary for the line. + """ + spans = [ + span for span in line.get("spans") or [] if str(span.get("text") or "").strip() + ] + if not spans: + return _default_style(fallback_size) + + dominant = max(spans, key=_span_text_weight) + return { + "font": str(dominant.get("font") or ""), + "flags": int(dominant.get("flags") or 0), + "color": _pdf_color_from_span(dominant), + "size": float(dominant.get("size") or fallback_size), + "ascender": float(dominant.get("ascender") or 0.8), + "descender": float(dominant.get("descender") or -0.2), + } + + +def _build_spans_detail(line: dict) -> tuple[list[dict], int]: + """ + Builds per-span style metadata and character offsets for a line. + + Args: + line (dict): The parsed line metadata being processed. + + Returns: + tuple[list[dict], int]: The span detail list and left-strip offset. + """ + raw_text = normalize("NFKC", _line_text(line)) + strip_offset = len(raw_text) - len(raw_text.lstrip()) + + spans_detail: list[dict] = [] + cursor = 0 + for span in line.get("spans", []): + span_text = normalize("NFKC", span.get("text", "")) + span_start = cursor + cursor += len(span_text) + spans_detail.append( + { + "start": span_start, + "end": cursor, + "style": { + "font": str(span.get("font") or ""), + "flags": int(span.get("flags") or 0), + "color": _pdf_color_from_span(span), + "size": float(span.get("size") or 10.0), + "ascender": float(span.get("ascender") or 0.8), + "descender": float(span.get("descender") or -0.2), + }, + } + ) + return spans_detail, strip_offset + + +def _entity_style_from_spans( + line_entry: dict, + offset_in_stripped_text: int, +) -> dict[str, Any]: + """ + Resolves the style for the entity offset inside a line entry. + + Args: + line_entry (dict): The `line_entry` value used by this helper. + offset_in_stripped_text (int): The entity offset inside the stripped line text. + + Returns: + dict[str, Any]: The resolved style dictionary for the entity offset. 
+ """ + spans_detail = line_entry.get("spans_detail") + if not spans_detail: + return line_entry.get("style") or _default_style() + + strip_offset = line_entry.get("strip_offset", 0) + raw_offset = offset_in_stripped_text + strip_offset + + for span_info in spans_detail: + if span_info["start"] <= raw_offset < span_info["end"]: + return span_info["style"] + + return line_entry.get("style") or _default_style() + + +def _font_size(line: dict, fallback: float = 10.0) -> float: + """ + Calculates a representative font size for a parsed line. + + Args: + line (dict): The parsed line metadata being processed. + fallback (float, optional): The fallback font size to use when the line has no span sizes. Defaults to 10.0. + + Returns: + float: The representative font size for the line. + """ + spans = line.get("spans") or [] + sizes = [float(span.get("size")) for span in spans if span.get("size")] + if not sizes: + return fallback + size = sum(sizes) / len(sizes) + return max(size * 0.9, PDF_TAG_MIN_FONT_SIZE) + + +def _style_flags(style: dict[str, Any]) -> tuple[bool, bool, bool, bool]: + """ + Extracts boolean style flags from a style dictionary. + + Args: + style (dict[str, Any]): The style dictionary being analyzed. + + Returns: + tuple[bool, bool, bool, bool]: The bold, italic, monospace, and serif flags. 
+ """ + flags = int(style.get("flags") or 0) + font_label = str(style.get("font") or "").lower() + + is_bold = bool(flags & TEXT_FLAG_BOLD) or "bold" in font_label + is_italic = bool(flags & TEXT_FLAG_ITALIC) or any( + token in font_label for token in ("italic", "oblique") + ) + is_mono = bool(flags & TEXT_FLAG_MONOSPACED) or any( + token in font_label for token in ("courier", "mono", "console") + ) + is_serif = bool(flags & TEXT_FLAG_SERIF) or any( + token in font_label + for token in ("times", "serif", "georgia", "garamond", "mistral") + ) + return is_bold, is_italic, is_mono, is_serif + + +def _base14_fontname_for_style(style: dict[str, Any]) -> str: + """ + Maps a style dictionary to the closest Base-14 font name. + + Args: + style (dict[str, Any]): The style dictionary being analyzed. + + Returns: + str: The Base-14 font name that best matches the style. + """ + is_bold, is_italic, is_mono, is_serif = _style_flags(style) + + if is_mono: + family = "Courier" + elif is_serif: + family = "Times" + else: + family = "Helvetica" + + variants = { + ("Helvetica", False, False): "Helvetica", + ("Helvetica", True, False): "Helvetica-Bold", + ("Helvetica", False, True): "Helvetica-Oblique", + ("Helvetica", True, True): "Helvetica-BoldOblique", + ("Times", False, False): "Times-Roman", + ("Times", True, False): "Times-Bold", + ("Times", False, True): "Times-Italic", + ("Times", True, True): "Times-BoldItalic", + ("Courier", False, False): "Courier", + ("Courier", True, False): "Courier-Bold", + ("Courier", False, True): "Courier-Oblique", + ("Courier", True, True): "Courier-BoldOblique", + } + return variants[(family, is_bold, is_italic)] + + +def _build_flexible_pattern(text: str) -> str: + """ + Builds a whitespace-tolerant regex pattern for the given text. + + Args: + text (str): The text value being normalized or searched. + + Returns: + str: The whitespace-tolerant regex pattern. 
+ """ + tokens = [re.escape(tok) for tok in re.split(r"\s+", text.strip()) if tok] + return r"\s+".join(tokens) + + +def _find_flexible( + haystack: str, + needle: str, + start: int = 0, +) -> tuple[int, int] | None: + """ + Finds a text span using exact and whitespace-tolerant matching. + + Args: + haystack (str): The source text to search within. + needle (str): The target text to search for. + start (int, optional): The preferred start offset for the search. Defaults to 0. + + Returns: + tuple[int, int] | None: The start and end offsets of the match, if found. + """ + if not needle: + return None + + idx = haystack.find(needle, start) + if idx >= 0: + return idx, idx + len(needle) + + pattern = _build_flexible_pattern(needle) + if not pattern: + return None + + match = re.search(pattern, haystack[start:]) + if match: + return start + match.start(), start + match.end() + + if start > 0: + match = re.search(pattern, haystack) + if match: + return match.start(), match.end() + + return None + + +def _token_parts(token: str) -> tuple[str, str | None]: + """ + Splits a logical token into its base label and numeric suffix. + + Args: + token (str): The logical replacement token being processed. + + Returns: + tuple[str, str | None]: The token base and optional numeric suffix. + """ + match = re.match(r"^(.*?)(?:_(\d+))?$", token) + if not match: + normalized = token.strip() or "ENT" + return normalized, None + + base = match.group(1).strip() or "ENT" + suffix = match.group(2) + return base, suffix + + +def _abbreviate_token(base: str, length: int) -> str: + """ + Builds an abbreviated token label with the requested length. + + Args: + base (str): The token base label to abbreviate or alias. + length (int): The target abbreviation length. + + Returns: + str: The abbreviated token label. 
+ """ + normalized = "".join(char for char in base.upper() if char.isalnum()) + if not normalized: + normalized = "ENT" + return normalized[:length] or normalized[:1] or "E" + + +def _token_aliases(base: str) -> tuple[str, ...]: + """ + Returns configured alias labels for a token base. + + Args: + base (str): The token base label to abbreviate or alias. + + Returns: + tuple[str, ...]: The configured aliases for the token base. + """ + aliases = PDF_TOKEN_ALIAS_MAP.get(base.upper(), ()) + normalized_aliases: list[str] = [] + + for alias in aliases: + normalized = re.sub(r"[^A-Z0-9_]", "", str(alias).upper()) + if ( + normalized + and normalized != base.upper() + and normalized not in normalized_aliases + ): + normalized_aliases.append(normalized) + + return tuple(normalized_aliases) + + +def _build_display_token_candidates(token: str) -> list[str]: + """ + Builds the list of token display candidates to try when rendering. + + Args: + token (str): The logical replacement token being processed. + + Returns: + list[str]: The candidate display tokens to try when rendering. + """ + base, suffix = _token_parts(token.upper()) + candidates: list[str] = [] + + def add(value: str) -> None: + """ + Appends a token display candidate when it has not been added yet. + + Args: + value (str): The rectangle-like value to normalize. + """ + if value and value not in candidates: + candidates.append(value) + + def add_base_variants(label: str) -> None: + """ + Appends the base token variants for the current label candidate. + + Args: + label (str): The label metadata being processed. 
+ """ + if suffix: + add(f"<{label}_{suffix}>") + add(f"<{label}>") + + add_base_variants(base) + + for alias in _token_aliases(base): + add_base_variants(alias) + + abbreviated = _abbreviate_token(base, PDF_TAG_MAX_ABBREVIATION) + add_base_variants(abbreviated) + + return candidates + + +def _iter_font_sizes(start_size: float) -> list[float]: + """ + Builds the descending font sizes to try when fitting a token. + + Args: + start_size (float): The `start_size` value used by this helper. + + Returns: + list[float]: The font sizes to try in descending order. + """ + if start_size <= 0: + return [] + + sizes: list[float] = [start_size] + current = start_size + while current - PDF_TAG_FONT_STEP >= PDF_TAG_MIN_FONT_SIZE - 1e-6: + current = round(current - PDF_TAG_FONT_STEP, 2) + if current not in sizes: + sizes.append(current) + + return sizes + + +def _fit_display_token( + token: str, + rect: pymupdf.Rect, + fontname: str, + base_font_size: float, + font_obj: pymupdf.Font | None = None, +) -> tuple[str | None, float | None]: + """ + Finds a token rendering variant and font size that fit inside a rectangle. + + Args: + token (str): The logical replacement token being processed. + rect (pymupdf.Rect): The rectangle used by the helper. + fontname (str): The font name to use for measurement or rendering. + base_font_size (float): The initial font size to try when fitting text. + font_obj (pymupdf.Font | None, optional): The font object used for measurement. Defaults to None. + + Returns: + tuple[str | None, float | None]: The fitted token text and font size. + """ + if rect.width <= 0 or rect.height <= 0: + return None, None + + available_width = max(rect.width - (2 * PDF_TAG_RECT_INSET), 1.0) + start_size = min(base_font_size, max(rect.height - 1.0, 1.0)) + if start_size < 1.0: + return None, None + + def _measure(text: str, size: float) -> float: + """ + Measures the width of a candidate token at the given font size. 
+ + Args: + text (str): The text value being normalized or searched. + size (float): The font size used for the current measurement. + + Returns: + float: The measured width of the candidate text. + """ + if font_obj is not None: + try: + return font_obj.text_length(text, fontsize=size) + except Exception: + pass + return pymupdf.get_text_length(text, fontname=fontname, fontsize=size) + + for size in _iter_font_sizes(start_size): + for candidate in _build_display_token_candidates(token): + if _measure(candidate, size) <= available_width + 0.1: + return candidate, size + + return None, None + + +_BASE14_FONT_CACHE: dict[str, pymupdf.Font] = {} + + +@lru_cache(maxsize=None) +def _cached_base14_font(name: str) -> pymupdf.Font: + """ + Loads and caches a Base-14 font by name. + + Args: + name (str): The Base-14 font name to load. + + Returns: + pymupdf.Font: The cached Base-14 font object. + """ + return pymupdf.Font(name) + + +def _get_base14_font(style: dict[str, Any]) -> pymupdf.Font: + """ + Returns the cached Base-14 font object for a style dictionary. + + Args: + style (dict[str, Any]): The style dictionary being analyzed. + + Returns: + pymupdf.Font: The cached Base-14 font for the style. + """ + name = _base14_fontname_for_style(style) + font = _BASE14_FONT_CACHE.get(name) + if font is None: + font = _cached_base14_font(name) + _BASE14_FONT_CACHE[name] = font + return font + + +def _rect_vertical_overlap(left: pymupdf.Rect, right: pymupdf.Rect) -> float: + """ + Calculates the vertical overlap ratio between two rectangles. + + Args: + left (pymupdf.Rect): The left rectangle or label to compare. + right (pymupdf.Rect): The right rectangle or label to compare. + + Returns: + float: The vertical overlap ratio between the rectangles. 
+ """ + overlap = max(0.0, min(left.y1, right.y1) - max(left.y0, right.y0)) + min_height = max(min(left.height, right.height), 1e-6) + return overlap / min_height + + +def _group_adjacent_rects( + rects: list[pymupdf.Rect], max_gap: float +) -> list[pymupdf.Rect]: + """ + Merges horizontally adjacent rectangles that belong to the same segment. + + Args: + rects (list[pymupdf.Rect]): The `rects` value used by this helper. + max_gap (float): The `max_gap` value used by this helper. + + Returns: + list[pymupdf.Rect]: The merged rectangle groups. + """ + if not rects: + return [] + + ordered = sorted(rects, key=lambda rect: (rect.y0, rect.x0, rect.x1)) + groups: list[list[pymupdf.Rect]] = [[ordered[0]]] + + for rect in ordered[1:]: + previous = groups[-1][-1] + gap = rect.x0 - previous.x1 + if _rect_vertical_overlap(previous, rect) >= 0.5 and gap <= max_gap: + groups[-1].append(rect) + else: + groups.append([rect]) + + merged_rects: list[pymupdf.Rect] = [] + for group in groups: + merged = pymupdf.Rect(group[0]) + for rect in group[1:]: + merged.include_rect(rect) + merged_rects.append(merged) + + return merged_rects diff --git a/aymurai/text/anonymization/pdf/layout.py b/aymurai/text/anonymization/pdf/layout.py new file mode 100644 index 0000000..50ce529 --- /dev/null +++ b/aymurai/text/anonymization/pdf/layout.py @@ -0,0 +1,510 @@ +from __future__ import annotations + +import re +from copy import deepcopy +from typing import Any +from unicodedata import normalize + +import pymupdf +from jiwer import cer + +from aymurai.logger import get_logger +from aymurai.text.anonymization.alignment import ( + _label_replacement_end as _label_end, +) +from aymurai.text.anonymization.alignment import ( + _label_replacement_start as _label_start, +) +from aymurai.text.anonymization.alignment import ( + resolve_render_token, +) +from aymurai.text.anonymization.pdf.common import ( + PDF_TAG_RECT_GAP_FACTOR, + PDF_TAG_RECT_GAP_MAX, + PDF_TAG_RECT_GAP_MIN, + _build_flexible_pattern, + 
_build_spans_detail, + _font_size, + _group_adjacent_rects, + _line_style, + _line_text, + _rect_tuple, + _rect_vertical_overlap, +) + +logger = get_logger(__name__) + + +def _same_boundary_candidate(left: dict, right: dict) -> bool: + """ + Checks whether two labels can share a merged boundary token. + + Args: + left (dict): The left rectangle or label to compare. + right (dict): The right rectangle or label to compare. + + Returns: + bool: Whether the labels can share a boundary token. + """ + left_attrs = left.get("attrs") or {} + right_attrs = right.get("attrs") or {} + + if left_attrs.get("aymurai_label") != right_attrs.get("aymurai_label"): + return False + + left_cid = left_attrs.get("canonical_entity_id") + right_cid = right_attrs.get("canonical_entity_id") + if left_cid and right_cid and str(left_cid) != str(right_cid): + return False + + left_text = str(left.get("text") or "").strip() + right_text = str(right.get("text") or "").strip() + return bool(left_text and right_text) + + +def _resolve_token(label: dict, render_context: dict[str, Any] | None) -> str: + """ + Resolves the logical replacement token for a label. + + Args: + label (dict): The label metadata being processed. + render_context (dict[str, Any] | None): The rendering context used to resolve replacement tokens. + + Returns: + str: The logical token that should replace the label. + """ + boundary_token = label.get("_boundary_token") + if boundary_token: + return boundary_token + + token = resolve_render_token(label, render_context) + return token or "ENT" + + +def _apply_minimal_boundary_merge( + paragraphs: list[dict], + render_context: dict[str, Any] | None, +) -> None: + """ + Propagates a shared token across paragraph-boundary label pairs. + + Args: + paragraphs (list[dict]): The paragraph collection being processed. + render_context (dict[str, Any] | None): The rendering context used to resolve replacement tokens. 
+ """ + for left_par, right_par in zip(paragraphs, paragraphs[1:]): + left_doc = left_par.get("document") or "" + right_doc = right_par.get("document") or "" + left_labels = left_par.get("labels") or [] + right_labels = right_par.get("labels") or [] + + if not left_doc or not right_doc or not left_labels or not right_labels: + continue + + left_candidates = [ + label + for label in left_labels + if _label_end(label) >= max(0, len(left_doc) - 2) + ] + right_candidates = [label for label in right_labels if _label_start(label) <= 2] + + if not left_candidates or not right_candidates: + continue + + for left_label in left_candidates: + for right_label in right_candidates: + if not _same_boundary_candidate(left_label, right_label): + continue + + shared_token = _resolve_token(left_label, render_context) + if not shared_token: + shared_token = _resolve_token(right_label, render_context) + if shared_token: + left_label["_boundary_token"] = shared_token + right_label["_boundary_token"] = shared_token + break + + +def _build_layout_paragraphs(parsed_doc: Any) -> list[dict]: + """ + Builds normalized paragraph metadata from the parsed PDF layout. + + Args: + parsed_doc (Any): The parsed PDF layout document. + + Returns: + list[dict]: The normalized layout paragraphs extracted from the parsed document. 
+ """ + chunks = parsed_doc.to_text( + page_chunks=True, + header=True, + footer=True, + show_progress=False, + ) + + paragraphs: list[dict] = [] + layout_index = 0 + for page_idx, (page, chunk) in enumerate(zip(parsed_doc.pages, chunks)): + page_text = chunk.get("text") or "" + page_boxes = chunk.get("page_boxes") or [] + + for box_meta in page_boxes: + box_idx = int(box_meta["index"]) + if box_idx >= len(page.boxes): + continue + + start, stop = box_meta.get("pos", (0, 0)) + box_text = normalize("NFKC", page_text[start:stop]).strip() + if not box_text: + continue + + box = page.boxes[box_idx] + line_entries: list[dict] = [] + line_text_chunks: list[str] = [] + line_cursor = 0 + + for line_idx, line in enumerate(box.textlines or []): + text = normalize("NFKC", _line_text(line)).strip() + if not text: + continue + + if line_text_chunks: + line_text_chunks.append("\n") + line_cursor += 1 + + line_start = line_cursor + line_text_chunks.append(text) + line_cursor += len(text) + line_end = line_cursor + style = _line_style(line) + spans_detail, strip_offset = _build_spans_detail(line) + + line_entries.append( + { + "page_index": page_idx, + "box_index": box_idx, + "line_index": line_idx, + "bbox": _rect_tuple(line["bbox"]), + "font_size": _font_size(line, float(style.get("size") or 10.0)), + "start": line_start, + "end": line_end, + "text": text, + "style": style, + "spans_detail": spans_detail, + "strip_offset": strip_offset, + } + ) + + line_text = "".join(line_text_chunks) + if not line_text: + continue + + paragraphs.append( + { + "plain_text": box_text, + "metadata": { + "layout_index": layout_index, + "page_index": page_idx, + "page_number": page.page_number, + "box_index": box_idx, + "boxclass": box.boxclass, + "box_bbox": ( + float(box.x0), + float(box.y0), + float(box.x1), + float(box.y1), + ), + "line_text": line_text, + "lines": line_entries, + }, + } + ) + layout_index += 1 + + return paragraphs + + +def _match_predictions_to_layout( + layout_paragraphs: 
list[dict], + preds: list[dict], +) -> list[dict]: + """ + Matches model predictions to the closest layout paragraphs. + + Args: + layout_paragraphs (list[dict]): The `layout_paragraphs` value used by this helper. + preds (list[dict]): The predictions to apply to the document. + + Returns: + list[dict]: The predictions annotated with their matched layout metadata. + """ + if not layout_paragraphs or not preds: + return [] + + available_indices = list(range(len(layout_paragraphs))) + all_indices = list(range(len(layout_paragraphs))) + matched: list[dict] = [] + + normalized_layout_texts = [ + normalize("NFKC", paragraph["plain_text"]).strip() + for paragraph in layout_paragraphs + ] + + for pred_idx, pred in enumerate(preds): + pred_text = normalize("NFKC", str(pred.get("document") or "")).strip() + if not pred_text: + continue + + candidate_pool = available_indices if available_indices else all_indices + exact_idx = next( + ( + idx + for idx in candidate_pool + if normalized_layout_texts[idx] == pred_text + ), + None, + ) + + if exact_idx is None: + exact_idx = min( + candidate_pool, + key=lambda idx: cer(pred_text, normalized_layout_texts[idx]), + ) + + paragraph = deepcopy(layout_paragraphs[exact_idx]) + paragraph["document"] = pred.get("document") or "" + paragraph["labels"] = pred.get("labels") or [] + paragraph["pred_index"] = pred_idx + matched.append(paragraph) + + if exact_idx in available_indices: + available_indices.remove(exact_idx) + + matched.sort(key=lambda paragraph: paragraph["metadata"]["layout_index"]) + return matched + + +def _pick_rect_group_for_segment( + page: pymupdf.Page, + line: dict, + text: str, + line_x_cursor: dict[tuple[int, int, int], float], +) -> pymupdf.Rect: + """ + Chooses the best rectangle group for a text segment on the page. + + Args: + page (pymupdf.Page): The PDF page being processed. + line (dict): The parsed line metadata being processed. + text (str): The text value being normalized or searched. 
+        line_x_cursor (dict[tuple[int, int, int], float]): The per-line cursor used to keep page searches stable.
+
+    Returns:
+        pymupdf.Rect: The chosen rectangle group for the segment; falls back to the line clip rectangle when no search match is found.
+    """
+    clip = pymupdf.Rect(line["bbox"])
+    rects = [rect for rect in page.search_for(text, clip=clip) if rect.intersects(clip)]
+    if not rects:
+        return clip
+
+    max_gap = min(
+        max(clip.height * PDF_TAG_RECT_GAP_FACTOR, PDF_TAG_RECT_GAP_MIN),
+        PDF_TAG_RECT_GAP_MAX,
+    )
+    grouped_rects = _group_adjacent_rects(rects, max_gap=max_gap)
+
+    line_key = (line["page_index"], line["box_index"], line["line_index"])
+    min_x = line_x_cursor.get(line_key, clip.x0 - 1)
+
+    for rect in grouped_rects:
+        if rect.x0 >= min_x - 0.5:
+            line_x_cursor[line_key] = rect.x1
+            return rect
+
+    chosen = grouped_rects[0]
+    line_x_cursor[line_key] = chosen.x1
+    return chosen
+
+
+def _normalize_line_chars(spans: list[dict]) -> list[dict[str, Any]]:
+    """
+    Normalizes per-character span data into searchable character entries.
+
+    Args:
+        spans (list[dict]): The span collection to normalize into character entries.
+
+    Returns:
+        list[dict[str, Any]]: The normalized character entries for the line.
+    """
+    chars: list[dict[str, Any]] = []
+    for span in spans:
+        for char in span.get("chars") or []:
+            norm_text = normalize("NFKC", str(char.get("c") or ""))
+            if not norm_text:
+                continue
+            bbox = pymupdf.Rect(char["bbox"])
+            for norm_char in norm_text:
+                chars.append({"char": norm_char, "bbox": bbox})
+    return chars
+
+
+def _line_chars_from_page(page: pymupdf.Page, line: dict) -> list[dict[str, Any]]:
+    """
+    Extracts character-level geometry for a parsed line from the page text.
+
+    Args:
+        page (pymupdf.Page): The PDF page being processed.
+        line (dict): The parsed line metadata being processed.
+
+    Returns:
+        list[dict[str, Any]]: The character entries extracted from the page.
+ """ + clip = pymupdf.Rect(line["bbox"]) + raw = page.get_text("rawdict", clip=clip) + target_text = normalize("NFKC", str(line.get("text") or "")).strip() + + best_chars: list[dict[str, Any]] = [] + best_score: tuple[float, float, float] | None = None + + for block in raw.get("blocks") or []: + if block.get("type", 0) != 0: + continue + for raw_line in block.get("lines") or []: + chars = _normalize_line_chars(raw_line.get("spans") or []) + if not chars: + continue + + candidate_rect = pymupdf.Rect(raw_line["bbox"]) + candidate_text = "".join(entry["char"] for entry in chars).strip() + overlap = ( + _rect_vertical_overlap(candidate_rect, clip) + if candidate_rect.intersects(clip) + else 0.0 + ) + text_score = 0.0 + if target_text or candidate_text: + text_score = ( + 0.0 + if target_text == candidate_text + else cer(target_text, candidate_text) + ) + bbox_score = ( + abs(candidate_rect.x0 - clip.x0) + + abs(candidate_rect.y0 - clip.y0) + + abs(candidate_rect.x1 - clip.x1) + + abs(candidate_rect.y1 - clip.y1) + ) / 100.0 + score = (1.0 - overlap, text_score, bbox_score) + if best_score is None or score < best_score: + best_score = score + best_chars = chars + + return best_chars + + +def _line_chars_text(chars: list[dict[str, Any]]) -> str: + """ + Builds the searchable text for a character entry list. + + Args: + chars (list[dict[str, Any]]): The character entry list being processed. + + Returns: + str: The concatenated character text. + """ + return "".join(str(entry.get("char") or "") for entry in chars) + + +def _find_line_char_span( + chars: list[dict[str, Any]], + text: str, + *, + start: int = 0, + raw_text: str | None = None, +) -> tuple[int, int] | None: + """ + Finds the character span for a text fragment inside a line. + + Args: + chars (list[dict[str, Any]]): The character entry list being processed. + text (str): The text value being normalized or searched. + start (int, optional): The preferred start offset for the search. Defaults to 0. 
+        raw_text (str | None, optional): The raw line text used as a fallback search surface. Defaults to None.
+
+    Returns:
+        tuple[int, int] | None: The start and end character offsets, if found.
+    """
+    if not chars or not text:
+        return None
+
+    haystack = raw_text if raw_text is not None else _line_chars_text(chars)
+    pattern = _build_flexible_pattern(text)
+
+    def _search(offset: int) -> tuple[int, int] | None:
+        """
+        Searches for the candidate span from the provided offset.
+
+        Args:
+            offset (int): The search offset used by the nested helper.
+
+        Returns:
+            tuple[int, int] | None: The matching span for the current offset, if found.
+        """
+        exact_idx = haystack.find(text, offset)
+        flexible_span = None
+        if pattern:
+            match = re.search(pattern, haystack[offset:])
+            if match is not None:
+                flexible_span = (offset + match.start(), offset + match.end())
+
+        if exact_idx < 0:
+            return flexible_span
+        exact_span = (exact_idx, exact_idx + len(text))
+        if flexible_span is None:
+            return exact_span
+        return min(exact_span, flexible_span, key=lambda span: span[0])
+
+    span = _search(start)
+    if span is None and start > 0:
+        span = _search(0)
+    return span
+
+
+def _rect_from_char_slice(
+    chars: list[dict[str, Any]],
+    start: int,
+    end: int,
+) -> pymupdf.Rect | None:
+    """
+    Builds a rectangle covering the requested character slice.
+
+    Args:
+        chars (list[dict[str, Any]]): The character entry list being processed.
+        start (int): The inclusive start index of the character slice.
+        end (int): The exclusive end index of the character slice.
+
+    Returns:
+        pymupdf.Rect | None: The rectangle covering the requested character slice.
+ """ + if not chars: + return None + + slice_start = max(int(start), 0) + slice_end = min(int(end), len(chars)) + if slice_end <= slice_start: + return None + + segment = chars[slice_start:slice_end] + if not segment: + return None + + boxes = [entry["bbox"] for entry in segment if str(entry["char"]).strip()] + if not boxes: + boxes = [entry["bbox"] for entry in segment] + if not boxes: + return None + + rect = pymupdf.Rect(boxes[0]) + for bbox in boxes[1:]: + rect.include_rect(bbox) + return rect diff --git a/aymurai/text/anonymization/pdf/ops.py b/aymurai/text/anonymization/pdf/ops.py new file mode 100644 index 0000000..bdad1d0 --- /dev/null +++ b/aymurai/text/anonymization/pdf/ops.py @@ -0,0 +1,828 @@ +from __future__ import annotations + +from typing import Any + +import pymupdf + +from aymurai.logger import get_logger +from aymurai.text.anonymization.alignment import ( + _label_replacement_start as _label_start, +) +from aymurai.text.anonymization.alignment import ( + _label_replacement_text as _label_surface_text, +) +from aymurai.text.anonymization.pdf.common import ( + PDF_TAG_RECT_GAP_MAX, + PDF_TAG_RECT_INSET, + PDF_TAG_RECT_X_PADDING, + PDF_TAG_RECT_Y_PADDING, + _base14_fontname_for_style, + _default_style, + _entity_style_from_spans, + _find_flexible, + _fit_display_token, + _get_base14_font, + _group_adjacent_rects, + _rect_vertical_overlap, +) +from aymurai.text.anonymization.pdf.layout import ( + _find_line_char_span, + _line_chars_from_page, + _line_chars_text, + _pick_rect_group_for_segment, + _rect_from_char_slice, + _resolve_token, +) +from aymurai.text.anonymization.pdf.widgets import ( + _apply_widget_ops, + _entity_overlaps_widget, + _page_widget_infos, + _prepare_signature_widget_ops, +) + +logger = get_logger(__name__) + +_IMAGE_OVERLAP_THRESHOLD = 0.3 + + +def _padded_rect(rect: pymupdf.Rect, clip: pymupdf.Rect) -> pymupdf.Rect: + """ + Pads a rectangle within the provided clipping bounds. 
+ + Args: + rect (pymupdf.Rect): The rectangle used by the helper. + clip (pymupdf.Rect): The clipping rectangle to constrain the operation. + + Returns: + pymupdf.Rect: The padded rectangle clipped to the provided bounds. + """ + padded = pymupdf.Rect(rect) + padded.x0 = max(clip.x0, padded.x0 - PDF_TAG_RECT_X_PADDING) + padded.y0 = max(clip.y0, padded.y0 - PDF_TAG_RECT_Y_PADDING) + padded.x1 = min(clip.x1, padded.x1 + PDF_TAG_RECT_X_PADDING) + padded.y1 = min(clip.y1, padded.y1 + PDF_TAG_RECT_Y_PADDING) + return padded + + +def _render_rect(rect: pymupdf.Rect) -> pymupdf.Rect: + """ + Builds the token rendering rectangle from the padded canvas rectangle. + + Args: + rect (pymupdf.Rect): The rectangle used by the helper. + + Returns: + pymupdf.Rect: The rectangle used to render the replacement token. + """ + render_rect = pymupdf.Rect(rect) + inset = min(PDF_TAG_RECT_INSET, max(render_rect.height * 0.1, 0.0)) + render_rect.x0 += inset + render_rect.x1 -= inset + if render_rect.x1 <= render_rect.x0: + render_rect = pymupdf.Rect(rect) + return render_rect + + +def _text_redact_rect(rect: pymupdf.Rect) -> pymupdf.Rect: + """ + Builds the redaction rectangle used to remove original text. + + Args: + rect (pymupdf.Rect): The rectangle used by the helper. + + Returns: + pymupdf.Rect: The rectangle used for text redaction. + """ + redact_rect = pymupdf.Rect(rect) + edge_inset = min(0.25, max(redact_rect.width * 0.01, 0.05)) + if redact_rect.width > (2 * edge_inset): + redact_rect.x0 += edge_inset + redact_rect.x1 -= edge_inset + return redact_rect + + +def _build_page_op( + rect: pymupdf.Rect, + line: dict | None, + token: str, + is_image: bool = False, + entity_style: dict[str, Any] | None = None, +) -> dict[str, Any]: + """ + Builds the rendering operation metadata for a matched page segment. + + Args: + rect (pymupdf.Rect): The rectangle used by the helper. + line (dict | None): The parsed line metadata being processed. 
+ token (str): The logical replacement token being processed. + is_image (bool, optional): Whether the operation is intended for image-backed content. Defaults to False. + entity_style (dict[str, Any] | None, optional): The resolved style dictionary for the entity text. Defaults to None. + + Returns: + dict[str, Any]: The rendering operation metadata for the segment. + """ + line_clip = pymupdf.Rect(line["bbox"]) if line else pymupdf.Rect(rect) + canvas_rect = _padded_rect(rect, line_clip) + render_rect = _render_rect(canvas_rect) + style = entity_style or (line or {}).get("style") or _default_style() + base_font_size = float((line or {}).get("font_size") or style.get("size") or 10.0) + + # Always use Base-14 fonts: they carry correct bold/italic weight and + # contain all glyphs needed for tags (<, >, _, digits, letters). + # Subset font buffers extracted from the PDF lack many of these glyphs. + fontname = _base14_fontname_for_style(style) + font_obj = _get_base14_font(style) + + display_token, fitted_size = _fit_display_token( + token, + render_rect, + fontname, + base_font_size, + font_obj=font_obj, + ) + + if not display_token or fitted_size is None: + logger.warning( + "Could not fit PDF token '%s' inside rect=%s", + token, + tuple(round(value, 2) for value in canvas_rect), + ) + + return { + "redact_rect": _text_redact_rect(rect), + "background_rect": canvas_rect, + "canvas_rect": canvas_rect, + "render_rect": render_rect, + "line_rect": line_clip, + "text": display_token, + "logical_token": token, + "fontname": fontname, + "fontsize": fitted_size, + "text_align": pymupdf.TEXT_ALIGN_LEFT, + "text_color": style.get("color") or (0.0, 0.0, 0.0), + "style": style, + } + + +def _image_rects_for_clip( + page: pymupdf.Page, + clip: pymupdf.Rect, +) -> list[pymupdf.Rect]: + """ + Collects image rectangles that overlap the given page region. + + Args: + page (pymupdf.Page): The PDF page being processed. 
+        clip (pymupdf.Rect): The clipping rectangle to constrain the operation.
+
+    Returns:
+        list[pymupdf.Rect]: The image rectangles that overlap the clip region.
+    """
+    rects: list[pymupdf.Rect] = []
+    for img_info in page.get_image_info():
+        bbox = img_info.get("bbox")
+        if bbox is None:
+            continue
+        img_rect = pymupdf.Rect(bbox)
+        if img_rect.intersects(clip) and img_rect.get_area() > 0:
+            rects.append(img_rect)
+    return rects
+
+
+def _entity_overlaps_image(
+    page: pymupdf.Page,
+    entity_rect: pymupdf.Rect,
+    image_rects: list[pymupdf.Rect],
+) -> pymupdf.Rect | None:
+    """
+    Checks whether an entity rectangle overlaps a detected image.
+
+    Args:
+        page (pymupdf.Page): The PDF page being processed.
+        entity_rect (pymupdf.Rect): The rectangle representing the entity on the page.
+        image_rects (list[pymupdf.Rect]): The image rectangles available for overlap checks.
+
+    Returns:
+        pymupdf.Rect | None: The overlapping image rectangle, if one exists.
+    """
+    for img_rect in image_rects:
+        overlap = _rect_vertical_overlap(entity_rect, img_rect)
+        if overlap >= _IMAGE_OVERLAP_THRESHOLD and entity_rect.intersects(img_rect):
+            return img_rect
+    return None
+
+
+def _collect_page_redactions(
+    doc: pymupdf.Document,
+    paragraphs: list[dict],
+    render_context: dict[str, Any] | None,
+) -> tuple[dict[int, list[dict]], dict[int, list[dict]], dict[int, list[dict]]]:
+    """
+    Collects text, widget, and signature redaction operations for a document.
+
+    Args:
+        doc (pymupdf.Document): The PDF document being processed.
+        paragraphs (list[dict]): The paragraph collection being processed.
+        render_context (dict[str, Any] | None): The rendering context used to resolve replacement tokens.
+
+    Returns:
+        tuple[dict[int, list[dict]], dict[int, list[dict]], dict[int, list[dict]]]: The page, text-widget, and signature-widget operations.
+ """ + page_ops: dict[int, list[dict]] = {} + widget_ops: dict[int, list[dict]] = {} + signature_widget_ops: dict[int, list[dict]] = {} + line_x_cursor: dict[tuple[int, int, int], float] = {} + line_char_cache: dict[tuple[int, int, int], list[dict[str, Any]]] = {} + line_char_text_cache: dict[tuple[int, int, int], str] = {} + line_char_cursor: dict[tuple[int, int, int], int] = {} + + # Pre-compute image rects and widgets per page + page_image_rects: dict[int, list[pymupdf.Rect]] = {} + page_widgets: dict[int, list[dict[str, Any]]] = {} + + for paragraph in paragraphs: + metadata = paragraph.get("metadata") or {} + lines = metadata.get("lines") or [] + if not lines: + continue + + page_index = int(metadata["page_index"]) + page = doc[page_index] + line_text = metadata.get("line_text") or "" + box_clip = pymupdf.Rect(metadata.get("box_bbox") or page.rect) + document = paragraph.get("document") or "" + labels = sorted(paragraph.get("labels") or [], key=_label_start) + search_cursor = 0 + + # Lazy-load image rects and widget infos for this page + if page_index not in page_image_rects: + page_image_rects[page_index] = _image_rects_for_clip(page, page.rect) + if page_index not in page_widgets: + page_widgets[page_index] = _page_widget_infos(page) + + for label in labels: + entity_text = _label_surface_text(label, document).strip() + if not entity_text: + continue + + token = _resolve_token(label, render_context) + + span = _find_flexible(line_text, entity_text, start=search_cursor) + if span is None: + span = _find_flexible(line_text, entity_text, start=0) + if span is None: + # -- Fallback: direct page search -- + fallback_rects = [ + rect + for rect in page.search_for(entity_text, clip=box_clip) + if rect.intersects(box_clip) + ] + + # Check if this is a widget-backed entity before falling back to images + if fallback_rects: + fallback_widget = _entity_overlaps_widget( + fallback_rects[0], + page_widgets[page_index], + ) + if fallback_widget is not None: + if ( + 
fallback_widget["field_type"] + == pymupdf.PDF_WIDGET_TYPE_TEXT + ): + widget_ops.setdefault(page_index, []).append( + { + "widget_xref": fallback_widget["xref"], + "field_name": fallback_widget["field_name"], + "widget_info": fallback_widget, + "entity_text": entity_text, + "logical_token": token, + } + ) + continue + if ( + fallback_widget["field_type"] + == pymupdf.PDF_WIDGET_TYPE_SIGNATURE + ): + op = _build_page_op( + fallback_rects[0], + lines[0] if lines else None, + token, + entity_style=fallback_widget.get("style") or None, + ) + op["widget_xref"] = fallback_widget["xref"] + op["widget_rect"] = fallback_widget["rect"] + signature_widget_ops.setdefault(page_index, []).append(op) + continue + + # Check if this is an image-based entity + if not fallback_rects: + img_match = _try_image_entity( + page, + entity_text, + box_clip, + page_image_rects[page_index], + ) + if img_match is not None: + op = _build_page_op( + img_match, + lines[0] if lines else None, + token, + is_image=True, + ) + op["image_rect"] = img_match + page_ops.setdefault(page_index, []).append(op) + continue + + if fallback_rects: + grouped_rects = _group_adjacent_rects( + fallback_rects, max_gap=PDF_TAG_RECT_GAP_MAX + ) + fallback_line = lines[0] if lines else None + + # Check if any of these rects overlap an image + for rect in grouped_rects: + img_rect = _entity_overlaps_image( + page, + rect, + page_image_rects[page_index], + ) + op = _build_page_op( + rect, + fallback_line, + token, + is_image=(img_rect is not None), + ) + if img_rect is not None: + op["image_rect"] = img_rect + page_ops.setdefault(page_index, []).append(op) + continue + + logger.warning( + "Could not map label '%s' on page=%s box=%s", + entity_text, + metadata.get("page_number"), + metadata.get("box_index"), + ) + continue + + search_cursor = span[1] + + # Collect line segments this entity spans + segments: list[ + tuple[ + dict, + str, + pymupdf.Rect, + pymupdf.Rect | None, + dict, + dict[str, Any] | None, + ] + ] = [] 
+ for line in lines: + overlap_start = max(span[0], line["start"]) + overlap_end = min(span[1], line["end"]) + if overlap_end <= overlap_start: + continue + + segment_text = line_text[overlap_start:overlap_end].strip() + if not segment_text: + continue + + line_key = ( + line["page_index"], + line["box_index"], + line["line_index"], + ) + line_chars = line_char_cache.get(line_key) + if line_chars is None: + line_chars = _line_chars_from_page(page, line) + line_char_cache[line_key] = line_chars + + line_char_text = line_char_text_cache.get(line_key) + if line_char_text is None: + line_char_text = _line_chars_text(line_chars) + line_char_text_cache[line_key] = line_char_text + + raw_span = _find_line_char_span( + line_chars, + segment_text, + start=line_char_cursor.get(line_key, 0), + raw_text=line_char_text, + ) + rect = None + if raw_span is not None: + line_char_cursor[line_key] = raw_span[1] + rect = _rect_from_char_slice(line_chars, raw_span[0], raw_span[1]) + + if rect is None: + raw_start = ( + overlap_start - line["start"] + int(line.get("strip_offset", 0)) + ) + raw_end = ( + overlap_end - line["start"] + int(line.get("strip_offset", 0)) + ) + rect = _rect_from_char_slice(line_chars, raw_start, raw_end) + if rect is None: + rect = _pick_rect_group_for_segment( + page, + line, + segment_text, + line_x_cursor, + ) + + widget_info = _entity_overlaps_widget( + rect, + page_widgets[page_index], + ) + + # Check for image overlap + img_rect = _entity_overlaps_image( + page, + rect, + page_image_rects[page_index], + ) + + # Determine entity-specific style from the span that + # actually contains this text (not the line's dominant style) + offset_in_line = overlap_start - line["start"] + ent_style = _entity_style_from_spans(line, offset_in_line) + + segments.append( + (line, segment_text, rect, img_rect, ent_style, widget_info) + ) + + if not segments: + continue + + if len(segments) == 1: + # Single-line entity: route widget-backed content through the widget path. 
+ line, _seg_text, rect, img_rect, ent_style, widget_info = segments[0] + if widget_info is not None: + if widget_info["field_type"] == pymupdf.PDF_WIDGET_TYPE_TEXT: + widget_ops.setdefault(page_index, []).append( + { + "widget_xref": widget_info["xref"], + "field_name": widget_info["field_name"], + "widget_info": widget_info, + "entity_text": entity_text, + "logical_token": token, + } + ) + continue + if widget_info["field_type"] == pymupdf.PDF_WIDGET_TYPE_SIGNATURE: + op = _build_page_op( + rect, + line, + token, + entity_style=ent_style, + ) + op["widget_xref"] = widget_info["xref"] + op["widget_rect"] = widget_info["rect"] + signature_widget_ops.setdefault(page_index, []).append(op) + continue + + op = _build_page_op( + rect, + line, + token, + is_image=(img_rect is not None), + entity_style=ent_style, + ) + if img_rect is not None: + op["image_rect"] = img_rect + page_ops.setdefault(page_index, []).append(op) + else: + # Multi-line entity: write the token on the widest segment only; blank the others. 
+ widest_idx = max( + range(len(segments)), + key=lambda i: segments[i][2].width, + ) + any_image = any(seg[3] is not None for seg in segments) + shared_image_rect = next( + (seg[3] for seg in segments if seg[3] is not None), + None, + ) + + signature_widget = None + if all(seg[5] is not None for seg in segments): + widget_xrefs = {int(seg[5]["xref"]) for seg in segments} + widget_types = {int(seg[5]["field_type"]) for seg in segments} + if len(widget_xrefs) == 1 and widget_types == { + pymupdf.PDF_WIDGET_TYPE_SIGNATURE + }: + signature_widget = segments[0][5] + + for seg_idx, ( + seg_line, + _seg_text, + seg_rect, + seg_img, + seg_style, + seg_widget, + ) in enumerate(segments): + if seg_idx == widest_idx: + op = _build_page_op( + seg_rect, + seg_line, + token, + is_image=(any_image and signature_widget is None), + entity_style=seg_style, + ) + if signature_widget is None and shared_image_rect is not None: + op["image_rect"] = shared_image_rect + else: + op = _build_page_op( + seg_rect, + seg_line, + token, + is_image=( + (seg_img is not None) and signature_widget is None + ), + entity_style=seg_style, + ) + op["text"] = None + op["fontsize"] = None + if seg_img is not None and signature_widget is None: + op["image_rect"] = seg_img + + if signature_widget is not None: + op["widget_xref"] = signature_widget["xref"] + op["widget_rect"] = signature_widget["rect"] + signature_widget_ops.setdefault(page_index, []).append(op) + else: + page_ops.setdefault(page_index, []).append(op) + + return page_ops, widget_ops, signature_widget_ops + + +def _try_image_entity( + page: pymupdf.Page, + entity_text: str, + clip: pymupdf.Rect, + image_rects: list[pymupdf.Rect], +) -> pymupdf.Rect | None: + """ + Finds the best image rectangle for an entity when text search fails. + + Args: + page (pymupdf.Page): The PDF page being processed. + entity_text (str): The entity text being mapped. + clip (pymupdf.Rect): The clipping rectangle to constrain the operation. 
+ image_rects (list[pymupdf.Rect]): The image rectangles available for overlap checks. + + Returns: + pymupdf.Rect | None: The best image rectangle for the entity, if found. + """ + if not image_rects: + return None + + # Try unclipped text search — the entity might be rendered as real text + # on top of (or near) an image. + text_hits = page.search_for(entity_text) + if text_hits: + for hit_rect in text_hits: + for img_rect in image_rects: + if hit_rect.intersects(img_rect): + return img_rect + + # Fallback: pick the image whose intersection with *clip* is largest + best: pymupdf.Rect | None = None + best_area = 0.0 + for img_rect in image_rects: + if not img_rect.intersects(clip) or img_rect.get_area() <= 0: + continue + intersection = img_rect & clip + area = intersection.get_area() + if area > best_area: + best_area = area + best = img_rect + + return best + + +def _render_text_op(page: pymupdf.Page, op: dict) -> None: + """ + Renders a single anonymization token back onto a page. + + Args: + page (pymupdf.Page): The PDF page being processed. + op (dict): The operation dictionary being processed. 
+ """ + canvas = pymupdf.Rect(op.get("background_rect") or op["canvas_rect"]) + if not op.get("skip_background_fill"): + page.draw_rect( + canvas, + color=(1, 1, 1), + fill=(1, 1, 1), + width=0, + overlay=True, + ) + + if not op.get("text") or not op.get("fontsize"): + return + + render = op["render_rect"] + line_rect = pymupdf.Rect(op.get("line_rect") or render) + style = op.get("style") or {} + base14_name = _base14_fontname_for_style(style) + font_obj = _get_base14_font(style) + + fontsize = float(op["fontsize"]) + descender = float(style.get("descender") or -0.2) + baseline_y = line_rect.y1 + (descender * fontsize) + baseline_y = min( + max(baseline_y, line_rect.y0 + (fontsize * 0.65)), + line_rect.y1 - 0.1, + ) + + text_width = font_obj.text_length(op["text"], fontsize=fontsize) + x_start = render.x0 + max((render.width - text_width) / 2.0, 0.0) + + try: + page.insert_text( + (x_start, baseline_y), + op["text"], + fontname=base14_name, + fontsize=fontsize, + color=op["text_color"], + overlay=True, + ) + return + except Exception as exc: + logger.debug("insert_text failed for '%s': %s", op["text"], exc) + + try: + tw = pymupdf.TextWriter(page.rect, color=op["text_color"]) + tw.fill_textbox( + render, + op["text"], + font=font_obj, + fontsize=fontsize, + align=op.get("text_align", pymupdf.TEXT_ALIGN_CENTER), + ) + tw.write_text(page, overlay=True) + return + except Exception as exc: + logger.debug("TextWriter failed for '%s': %s", op["text"], exc) + + try: + page.insert_textbox( + render, + op["text"], + fontname=base14_name, + fontsize=fontsize, + color=op["text_color"], + align=op.get("text_align", pymupdf.TEXT_ALIGN_CENTER), + overlay=True, + ) + except Exception as exc: + logger.warning( + "All text insertion methods failed for '%s': %s", + op["text"], + exc, + ) + + +def _page_asset_rect(op: dict[str, Any]) -> pymupdf.Rect | None: + """ + Resolves the asset rectangle associated with a page operation. 
+ + Args: + op (dict[str, Any]): The operation dictionary being processed. + + Returns: + pymupdf.Rect | None: The asset rectangle associated with the operation, if any. + """ + asset_rect = op.get("asset_rect") or op.get("image_rect") + if asset_rect is None: + return None + return pymupdf.Rect(asset_rect) + + +def _partition_page_ops( + page_ops: dict[int, list[dict]], +) -> tuple[dict[int, list[dict]], dict[int, list[dict]]]: + """ + Splits page operations into text-only and asset-backed groups. + + Args: + page_ops (dict[int, list[dict]]): The collected page operations grouped by page index. + + Returns: + tuple[dict[int, list[dict]], dict[int, list[dict]]]: The text-only and asset-backed operations. + """ + text_ops: dict[int, list[dict]] = {} + asset_ops: dict[int, list[dict]] = {} + + for page_idx, ops in page_ops.items(): + for op in ops: + if _page_asset_rect(op) is None: + text_ops.setdefault(page_idx, []).append(op) + else: + asset_ops.setdefault(page_idx, []).append(op) + + return text_ops, asset_ops + + +def _apply_text_redactions( + doc: pymupdf.Document, + text_page_ops: dict[int, list[dict]], +) -> None: + """ + Applies text-only redactions and re-renders replacement tokens. + + Args: + doc (pymupdf.Document): The PDF document being processed. + text_page_ops (dict[int, list[dict]]): The text-only page operations grouped by page index. + """ + for page_idx, ops in text_page_ops.items(): + if not ops: + continue + + page = doc[page_idx] + for op in ops: + page.add_redact_annot( + op["redact_rect"], + text=None, + fill=(1, 1, 1), + cross_out=False, + ) + + page.apply_redactions( + images=pymupdf.PDF_REDACT_IMAGE_NONE, + graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, + text=pymupdf.PDF_REDACT_TEXT_REMOVE, + ) + + for op in ops: + _render_text_op(page, op) + + +def _apply_asset_redactions( + doc: pymupdf.Document, + asset_page_ops: dict[int, list[dict]], +) -> None: + """ + Applies asset-backed redactions and re-renders replacement tokens. 
+ + Args: + doc (pymupdf.Document): The PDF document being processed. + asset_page_ops (dict[int, list[dict]]): The asset-backed page operations grouped by page index. + """ + for page_idx, ops in asset_page_ops.items(): + if not ops: + continue + + page = doc[page_idx] + graphics_mode = pymupdf.PDF_REDACT_LINE_ART_NONE + + for op in ops: + asset_rect = _page_asset_rect(op) + if asset_rect is None: + continue + + page.add_redact_annot( + asset_rect, + text=None, + fill=(1, 1, 1), + cross_out=False, + ) + graphics_mode = max( + graphics_mode, + int(op.get("graphics_mode") or pymupdf.PDF_REDACT_LINE_ART_NONE), + ) + + page.apply_redactions( + images=pymupdf.PDF_REDACT_IMAGE_REMOVE, + graphics=graphics_mode, + text=pymupdf.PDF_REDACT_TEXT_REMOVE, + ) + + for op in ops: + _render_text_op(page, op) + + +def _apply_redactions( + doc: pymupdf.Document, + page_ops: dict[int, list[dict]], + widget_ops: dict[int, list[dict]], + signature_widget_ops: dict[int, list[dict]], +) -> None: + """ + Applies all collected PDF redactions in the correct order. + + Args: + doc (pymupdf.Document): The PDF document being processed. + page_ops (dict[int, list[dict]]): The collected page operations grouped by page index. + widget_ops (dict[int, list[dict]]): The collected text widget operations grouped by page index. + signature_widget_ops (dict[int, list[dict]]): The collected signature widget operations grouped by page index. 
+ """ + _apply_widget_ops(doc, widget_ops) + _prepare_signature_widget_ops(doc, signature_widget_ops) + + text_page_ops, asset_page_ops = _partition_page_ops(page_ops) + for page_idx, ops in signature_widget_ops.items(): + asset_page_ops.setdefault(page_idx, []).extend(ops) + + _apply_text_redactions(doc, text_page_ops) + _apply_asset_redactions(doc, asset_page_ops) diff --git a/aymurai/text/anonymization/pdf/sanitize.py b/aymurai/text/anonymization/pdf/sanitize.py new file mode 100644 index 0000000..408f32b --- /dev/null +++ b/aymurai/text/anonymization/pdf/sanitize.py @@ -0,0 +1,294 @@ +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any + +import pymupdf + +from aymurai.logger import get_logger +from aymurai.settings import settings + +logger = get_logger(__name__) + + +def _pdf_metadata_mod_date() -> str: + """ + Builds the PDF metadata modification timestamp in UTC. + + Returns: + str: The PDF-formatted UTC modification timestamp. + """ + timestamp = datetime.now(timezone.utc) + return timestamp.strftime("D:%Y%m%d%H%M%S+00'00'") + + +def _append_cleanup_rect( + cleanup_rects: dict[int, list[pymupdf.Rect]], + page_idx: int, + rect: pymupdf.Rect | tuple[float, float, float, float] | None, +) -> None: + """ + Appends a cleanup rectangle for later document sanitization. + + Args: + cleanup_rects (dict[int, list[pymupdf.Rect]]): The cleanup rectangles grouped by page index. + page_idx (int): The page index associated with the operation. + rect (pymupdf.Rect | tuple[float, float, float, float] | None): The rectangle used by the helper. + """ + if rect is None: + return + + cleanup_rect = pymupdf.Rect(rect) + if cleanup_rect.get_area() <= 0: + return + cleanup_rects.setdefault(page_idx, []).append(cleanup_rect) + + +def _cleanup_rect_for_page_op(op: dict[str, Any]) -> pymupdf.Rect | None: + """ + Builds the cleanup rectangle for a standard page operation. 
+ + Args: + op (dict[str, Any]): The operation dictionary being processed. + + Returns: + pymupdf.Rect | None: The cleanup rectangle for the page operation, if available. + """ + if op.get("image_rect") is not None: + cleanup_rect = pymupdf.Rect(op["image_rect"]) + redact_rect = op.get("redact_rect") + if redact_rect is not None: + cleanup_rect.include_rect(pymupdf.Rect(redact_rect)) + return cleanup_rect + + cleanup_source = ( + op.get("redact_rect") or op.get("background_rect") or op.get("canvas_rect") + ) + if cleanup_source is None: + return None + return pymupdf.Rect(cleanup_source) + + +def _cleanup_rect_for_widget_op(op: dict[str, Any]) -> pymupdf.Rect | None: + """ + Builds the cleanup rectangle for a text widget operation. + + Args: + op (dict[str, Any]): The operation dictionary being processed. + + Returns: + pymupdf.Rect | None: The cleanup rectangle for the widget operation, if available. + """ + widget_info = op.get("widget_info") or {} + widget_rect = widget_info.get("rect") + if widget_rect is None: + return None + return pymupdf.Rect(widget_rect) + + +def _cleanup_rect_for_signature_widget_op(op: dict[str, Any]) -> pymupdf.Rect | None: + """ + Builds the cleanup rectangle for a signature widget operation. + + Args: + op (dict[str, Any]): The operation dictionary being processed. + + Returns: + pymupdf.Rect | None: The cleanup rectangle for the signature widget operation, if available. + """ + widget_rect = op.get("widget_rect") + if widget_rect is not None: + return pymupdf.Rect(widget_rect) + + background_rect = op.get("background_rect") or op.get("canvas_rect") + if background_rect is None: + return None + return pymupdf.Rect(background_rect) + + +def _collect_link_cleanup_rects( + page_ops: dict[int, list[dict]], + widget_ops: dict[int, list[dict]], + signature_widget_ops: dict[int, list[dict]], +) -> dict[int, list[pymupdf.Rect]]: + """ + Collects cleanup rectangles used to prune overlapping links. 
+ + Args: + page_ops (dict[int, list[dict]]): The collected page operations grouped by page index. + widget_ops (dict[int, list[dict]]): The collected text widget operations grouped by page index. + signature_widget_ops (dict[int, list[dict]]): The collected signature widget operations grouped by page index. + + Returns: + dict[int, list[pymupdf.Rect]]: The cleanup rectangles grouped by page index. + """ + cleanup_rects: dict[int, list[pymupdf.Rect]] = {} + + for page_idx, ops in page_ops.items(): + for op in ops: + _append_cleanup_rect(cleanup_rects, page_idx, _cleanup_rect_for_page_op(op)) + + for page_idx, ops in widget_ops.items(): + for op in ops: + _append_cleanup_rect( + cleanup_rects, + page_idx, + _cleanup_rect_for_widget_op(op), + ) + + for page_idx, ops in signature_widget_ops.items(): + for op in ops: + _append_cleanup_rect( + cleanup_rects, + page_idx, + _cleanup_rect_for_signature_widget_op(op), + ) + + return cleanup_rects + + +def _remove_overlapping_page_links( + doc: pymupdf.Document, + cleanup_rects: dict[int, list[pymupdf.Rect]], +) -> None: + """ + Deletes page links that overlap anonymized regions. + + Args: + doc (pymupdf.Document): The PDF document being processed. + cleanup_rects (dict[int, list[pymupdf.Rect]]): The cleanup rectangles grouped by page index. + """ + for page_idx, page_rects in cleanup_rects.items(): + if not page_rects: + continue + + page = doc[page_idx] + for link in list(page.get_links()): + link_rect = link.get("from") + if link_rect is None: + continue + link_rect = pymupdf.Rect(link_rect) + if not any(link_rect.intersects(rect) for rect in page_rects): + continue + try: + page.delete_link(link) + except Exception as exc: + logger.warning( + "Failed to delete PDF link on page=%s rect=%s: %s", + page_idx, + tuple(round(value, 2) for value in link_rect), + exc, + ) + + +def _remove_remaining_annotations(doc: pymupdf.Document) -> None: + """ + Deletes residual page annotations after sanitization. 
+ + Args: + doc (pymupdf.Document): The PDF document being processed. + """ + for page_idx, page in enumerate(doc): + for annot in list(page.annots() or []): + try: + page.delete_annot(annot) + except Exception as exc: + logger.warning( + "Failed to delete residual PDF annotation on page=%s: %s", + page_idx, + exc, + ) + + +def _clear_standard_metadata(doc: pymupdf.Document) -> None: + """ + Clears the standard PDF metadata fields on a document. + + Args: + doc (pymupdf.Document): The PDF document being processed. + """ + doc.set_metadata( + { + "title": "", + "author": "", + "subject": "", + "keywords": "", + "creator": "", + "producer": "", + "creationDate": "", + "modDate": "", + "trapped": "", + } + ) + + +def _apply_aymurai_metadata(doc: pymupdf.Document) -> None: + """ + Applies the configured AymurAI tooling metadata fields to the PDF document. + + Args: + doc (pymupdf.Document): The PDF document being processed. + """ + metadata = dict(doc.metadata or {}) + metadata.update( + { + "title": metadata.get("title") or "", + "author": "", + "subject": metadata.get("subject") or "", + "keywords": metadata.get("keywords") or "", + "creator": settings.ANONYMIZATION_METADATA_CREATOR, + "producer": settings.ANONYMIZATION_METADATA_PRODUCER, + "creationDate": metadata.get("creationDate") or "", + "modDate": _pdf_metadata_mod_date(), + "trapped": metadata.get("trapped") or "", + } + ) + doc.set_metadata(metadata) + + +def _sanitize_document( + doc: pymupdf.Document, + cleanup_rects: dict[int, list[pymupdf.Rect]], +) -> None: + """ + Sanitizes document-level PDF metadata, attachments, and annotations. + + Args: + doc (pymupdf.Document): The PDF document being processed. + cleanup_rects (dict[int, list[pymupdf.Rect]]): The cleanup rectangles grouped by page index. 
+ """ + _remove_overlapping_page_links(doc, cleanup_rects) + doc.scrub( + metadata=True, + xml_metadata=True, + javascript=True, + attached_files=True, + embedded_files=True, + thumbnails=True, + reset_responses=True, + hidden_text=True, + clean_pages=True, + remove_links=False, + reset_fields=False, + redactions=False, + ) + _remove_remaining_annotations(doc) + _clear_standard_metadata(doc) + _apply_aymurai_metadata(doc) + + get_xml_metadata = getattr(doc, "get_xml_metadata", None) + del_xml_metadata = getattr(doc, "del_xml_metadata", None) + if callable(get_xml_metadata) and callable(del_xml_metadata): + try: + xml_metadata = get_xml_metadata() + except Exception as exc: + logger.warning("Failed to read PDF XML metadata after scrub: %s", exc) + else: + if xml_metadata: + try: + del_xml_metadata() + except Exception as exc: + logger.warning( + "Failed to delete residual PDF XML metadata: %s", + exc, + ) diff --git a/aymurai/text/anonymization/pdf/watermark.py b/aymurai/text/anonymization/pdf/watermark.py new file mode 100644 index 0000000..50b2b2e --- /dev/null +++ b/aymurai/text/anonymization/pdf/watermark.py @@ -0,0 +1,315 @@ +from __future__ import annotations + +import os +from functools import lru_cache +from pathlib import Path +from typing import Any + +import pymupdf + +from aymurai.logger import get_logger +from aymurai.settings import settings + +logger = get_logger(__name__) + +WATERMARK_PREFIX_TEXT = "Documento anonimizado por " +WATERMARK_LINK_TEXT = "AymurAI" +WATERMARK_TEXT = f"{WATERMARK_PREFIX_TEXT}{WATERMARK_LINK_TEXT}" +WATERMARK_URL = "https://www.aymurai.info/" +WATERMARK_FONT_SIZE = 10.0 +WATERMARK_MARGIN_X = 24.0 +WATERMARK_BASELINE_MARGIN = 12.0 +WATERMARK_TEXT_COLOR = tuple(channel / 255 for channel in (192, 192, 192)) +WATERMARK_LINK_COLOR = tuple(channel / 255 for channel in (115, 190, 250)) + + +def _candidate_font_paths() -> tuple[list[Path], list[Path]]: + """ + Builds the ordered list of candidate font paths for the PDF watermark. 
+ + Returns: + tuple[list[Path], list[Path]]: The regular and bold watermark font candidates. + """ + override_regular = ( + os.getenv("PDF_WATERMARK_FONT_REGULAR") or settings.PDF_WATERMARK_FONT_REGULAR + ) + override_bold = ( + os.getenv("PDF_WATERMARK_FONT_BOLD") or settings.PDF_WATERMARK_FONT_BOLD + ) + + regular_candidates: list[Path] = [] + bold_candidates: list[Path] = [] + + if override_regular: + regular_candidates.append(Path(override_regular).expanduser()) + if override_bold: + bold_candidates.append(Path(override_bold).expanduser()) + + resource_roots: list[Path] = [] + resources_base = Path(settings.RESOURCES_BASEPATH) + if resources_base.is_absolute(): + resource_roots.append(resources_base) + else: + resource_roots.append((Path("/workspace") / resources_base).resolve()) + resource_roots.append(resources_base) + + font_roots: list[Path] = [] + for root in resource_roots: + font_roots.extend([root / "fonts", root / "fonts" / "archivo"]) + + for root in font_roots: + regular_candidates.extend( + [ + root / "Archivo-Regular.ttf", + root / "Archivo-Regular.otf", + root / "Archivo[wdth,wght].ttf", + root / "Archivo-VariableFont_wdth,wght.ttf", + ] + ) + bold_candidates.extend( + [ + root / "Archivo-Bold.ttf", + root / "Archivo-Bold.otf", + root / "Archivo-BoldItalic.ttf", + root / "Archivo-VariableFont_wdth,wght.ttf", + root / "Archivo[wdth,wght].ttf", + ] + ) + + system_roots = [ + Path("/usr/share/fonts/truetype/archivo"), + Path("/usr/share/fonts/opentype/archivo"), + Path("/usr/local/share/fonts/archivo"), + Path.home() / ".local/share/fonts", + Path.home() / ".local/share/fonts/archivo", + ] + for root in system_roots: + regular_candidates.extend( + [ + root / "Archivo-Regular.ttf", + root / "Archivo-Regular.otf", + root / "Archivo[wdth,wght].ttf", + root / "Archivo-VariableFont_wdth,wght.ttf", + ] + ) + bold_candidates.extend( + [ + root / "Archivo-Bold.ttf", + root / "Archivo-Bold.otf", + root / "Archivo-BoldItalic.ttf", + root / 
"Archivo-VariableFont_wdth,wght.ttf", + root / "Archivo[wdth,wght].ttf", + ] + ) + + return regular_candidates, bold_candidates + + +def _first_existing_path(paths: list[Path]) -> str | None: + """ + Returns the first existing file path from the provided candidates. + + Args: + paths (list[Path]): The candidate paths to inspect. + + Returns: + str | None: The first existing file path, if one is found. + """ + seen: set[str] = set() + for path in paths: + expanded = path.expanduser() + resolved = str(expanded) + if resolved in seen: + continue + seen.add(resolved) + if expanded.exists() and expanded.is_file(): + return str(expanded) + return None + + +@lru_cache(maxsize=1) +def _watermark_font_paths() -> tuple[str | None, str | None]: + """ + Resolves the font paths used by the PDF watermark. + + Returns: + tuple[str | None, str | None]: The resolved regular and bold watermark font paths. + """ + regular_candidates, bold_candidates = _candidate_font_paths() + regular_path = _first_existing_path(regular_candidates) + bold_path = _first_existing_path(bold_candidates) + if regular_path is None and bold_path is not None: + regular_path = bold_path + if bold_path is None: + bold_path = regular_path + return regular_path, bold_path + + +@lru_cache(maxsize=1) +def _watermark_font_config() -> dict[str, Any]: + """ + Builds the font configuration used to render the PDF watermark. + + Returns: + dict[str, Any]: The watermark font configuration dictionary. 
+ """ + regular_path, bold_path = _watermark_font_paths() + if regular_path: + try: + return { + "text_fontname": "archivo-watermark", + "text_fontfile": regular_path, + "text_font": pymupdf.Font(fontfile=regular_path), + "link_fontname": "archivo-watermark-bold", + "link_fontfile": bold_path or regular_path, + "link_font": pymupdf.Font(fontfile=bold_path or regular_path), + } + except Exception as exc: + logger.warning( + "Could not load Archivo font for PDF watermark, falling back to Base-14 fonts: %s", + exc, + ) + + return { + "text_fontname": "Helvetica", + "text_fontfile": None, + "text_font": pymupdf.Font("Helvetica"), + "link_fontname": "Helvetica-Bold", + "link_fontfile": None, + "link_font": pymupdf.Font("Helvetica-Bold"), + } + + +def _watermark_text_length( + text: str, + *, + font_obj: pymupdf.Font, + fontname: str, + fontsize: float, +) -> float: + """ + Measures the rendered width of watermark text. + + Args: + text (str): The text value being normalized or searched. + font_obj (pymupdf.Font): The font object used for measurement. + fontname (str): The font name to use for measurement or rendering. + fontsize (float): The font size used for measurement or rendering. + + Returns: + float: The rendered width of the watermark text. + """ + try: + return float(font_obj.text_length(text, fontsize=fontsize)) + except Exception: + return float( + pymupdf.get_text_length(text, fontname=fontname, fontsize=fontsize) + ) + + +def _insert_watermark_text( + page: pymupdf.Page, + point: tuple[float, float], + text: str, + *, + fontname: str, + fontsize: float, + color: tuple[float, float, float], + fontfile: str | None = None, +) -> None: + """ + Inserts watermark text onto a page using the resolved font settings. + + Args: + page (pymupdf.Page): The PDF page being processed. + point (tuple[float, float]): The insertion point on the page. + text (str): The text value being normalized or searched. + fontname (str): The font name to use for measurement or rendering. 
+ fontsize (float): The font size used for measurement or rendering. + color (tuple[float, float, float]): The PDF RGB color used to render the text. + fontfile (str | None, optional): The optional font file path to embed for rendering. Defaults to None. + """ + kwargs: dict[str, Any] = { + "fontsize": fontsize, + "fontname": fontname, + "color": color, + "overlay": True, + } + if fontfile: + kwargs["fontfile"] = fontfile + page.insert_text(point, text, **kwargs) + + +def add_pdf_footer_watermark(doc: pymupdf.Document) -> None: + """ + Adds the anonymization watermark to the footer of each PDF page. + + Args: + doc (pymupdf.Document): The PDF document being processed. + """ + font_config = _watermark_font_config() + prefix_width = _watermark_text_length( + WATERMARK_PREFIX_TEXT, + font_obj=font_config["text_font"], + fontname=font_config["text_fontname"], + fontsize=WATERMARK_FONT_SIZE, + ) + link_width = _watermark_text_length( + WATERMARK_LINK_TEXT, + font_obj=font_config["link_font"], + fontname=font_config["link_fontname"], + fontsize=WATERMARK_FONT_SIZE, + ) + total_width = prefix_width + link_width + + for page_index, page in enumerate(doc): + if page_index % 2 == 0: + x_start = max( + WATERMARK_MARGIN_X, + page.rect.width - total_width - WATERMARK_MARGIN_X, + ) + else: + x_start = WATERMARK_MARGIN_X + + baseline_y = page.rect.height - WATERMARK_BASELINE_MARGIN + link_x = x_start + prefix_width + + _insert_watermark_text( + page, + (x_start, baseline_y), + WATERMARK_PREFIX_TEXT, + fontname=font_config["text_fontname"], + fontsize=WATERMARK_FONT_SIZE, + color=WATERMARK_TEXT_COLOR, + fontfile=font_config["text_fontfile"], + ) + _insert_watermark_text( + page, + (link_x, baseline_y), + WATERMARK_LINK_TEXT, + fontname=font_config["link_fontname"], + fontsize=WATERMARK_FONT_SIZE, + color=WATERMARK_LINK_COLOR, + fontfile=font_config["link_fontfile"], + ) + + underline_y = min(page.rect.height - 1.0, baseline_y + 1.0) + page.draw_line( + (link_x, underline_y), + 
(link_x + link_width, underline_y), + color=WATERMARK_LINK_COLOR, + width=0.8, + overlay=True, + ) + page.insert_link( + { + "kind": pymupdf.LINK_URI, + "from": pymupdf.Rect( + link_x, + baseline_y - WATERMARK_FONT_SIZE, + link_x + link_width, + min(page.rect.height, baseline_y + 2.0), + ), + "uri": WATERMARK_URL, + } + ) diff --git a/aymurai/text/anonymization/pdf/widgets.py b/aymurai/text/anonymization/pdf/widgets.py new file mode 100644 index 0000000..3ea97d7 --- /dev/null +++ b/aymurai/text/anonymization/pdf/widgets.py @@ -0,0 +1,323 @@ +from __future__ import annotations + +from typing import Any + +import pymupdf + +from aymurai.logger import get_logger +from aymurai.text.anonymization.pdf.common import ( + _build_display_token_candidates, + _default_style, + _find_flexible, + _get_base14_font, +) + +logger = get_logger(__name__) + + +def _signature_background_rect( + op: dict[str, Any], + widget_rect: pymupdf.Rect, +) -> pymupdf.Rect: + """ + Builds the background rectangle used for a signature replacement. + + Args: + op (dict[str, Any]): The operation dictionary being processed. + widget_rect (pymupdf.Rect): The rectangle occupied by the widget. + + Returns: + pymupdf.Rect: The background rectangle for the signature replacement. 
+ """ + background = pymupdf.Rect( + op.get("line_rect") or op.get("canvas_rect") or widget_rect + ) + canvas_rect = op.get("canvas_rect") + if canvas_rect is not None: + background.include_rect(pymupdf.Rect(canvas_rect)) + + pad_x = max(background.height * 0.75, 2.0) + pad_y = max(background.height * 0.25, 0.75) + widget_clip = pymupdf.Rect(widget_rect) + + background.x0 = max(widget_clip.x0, background.x0 - pad_x) + background.y0 = max(widget_clip.y0, background.y0 - pad_y) + background.x1 = min(widget_clip.x1, background.x1 + pad_x) + background.y1 = min(widget_clip.y1, background.y1 + pad_y) + return background + + +def _widget_text_color(widget: pymupdf.Widget) -> tuple[float, float, float]: + """ + Extracts the text color configured on a PDF widget. + + Args: + widget (pymupdf.Widget): The widget being processed. + + Returns: + tuple[float, float, float]: The widget text color in PDF RGB components. + """ + values = list(widget.text_color or []) + if not values: + return (0.0, 0.0, 0.0) + if len(values) == 1: + shade = float(values[0]) + return (shade, shade, shade) + if len(values) >= 3: + return tuple(float(value) for value in values[:3]) + return (0.0, 0.0, 0.0) + + +def _style_from_widget(widget: pymupdf.Widget) -> dict[str, Any]: + """ + Builds a text style dictionary from a widget definition. + + Args: + widget (pymupdf.Widget): The widget being processed. + + Returns: + dict[str, Any]: The style dictionary derived from the widget. + """ + return { + "font": str(widget.text_font or ""), + "flags": 0, + "color": _widget_text_color(widget), + "size": float(widget.text_fontsize or 10.0), + "ascender": 0.8, + "descender": -0.2, + } + + +def _page_widget_infos(page: pymupdf.Page) -> list[dict[str, Any]]: + """ + Collects text and signature widget metadata for a page. + + Args: + page (pymupdf.Page): The PDF page being processed. + + Returns: + list[dict[str, Any]]: The widget metadata collected for the page. 
+ """ + infos: list[dict[str, Any]] = [] + for widget in page.widgets() or []: + if widget.field_type not in ( + pymupdf.PDF_WIDGET_TYPE_TEXT, + pymupdf.PDF_WIDGET_TYPE_SIGNATURE, + ): + continue + infos.append( + { + "xref": int(widget.xref), + "field_type": int(widget.field_type), + "field_name": str(widget.field_name or ""), + "field_value": str(widget.field_value or ""), + "rect": pymupdf.Rect(widget.rect), + "style": _style_from_widget(widget), + } + ) + return infos + + +def _entity_overlaps_widget( + entity_rect: pymupdf.Rect, + widget_infos: list[dict[str, Any]], +) -> dict[str, Any] | None: + """ + Finds the widget that most overlaps the given entity rectangle. + + Args: + entity_rect (pymupdf.Rect): The rectangle representing the entity on the page. + widget_infos (list[dict[str, Any]]): The widget metadata available for overlap checks. + + Returns: + dict[str, Any] | None: The best overlapping widget info, if one exists. + """ + best_widget: dict[str, Any] | None = None + best_area = 0.0 + for widget_info in widget_infos: + widget_rect = widget_info["rect"] + if not entity_rect.intersects(widget_rect): + continue + area = (entity_rect & widget_rect).get_area() + if area > best_area: + best_area = area + best_widget = widget_info + return best_widget + + +def _fit_widget_token( + widget_info: dict[str, Any], + current_text: str, + entity_span: tuple[int, int], + token: str, +) -> str: + """ + Finds a token variant that fits inside a widget value. + + Args: + widget_info (dict[str, Any]): The widget metadata being processed. + current_text (str): The current widget text value. + entity_span (tuple[int, int]): The span of the entity inside the widget text. + token (str): The logical replacement token being processed. + + Returns: + str: The token variant that fits in the widget value. 
+ """ + style = widget_info.get("style") or _default_style() + rect = pymupdf.Rect(widget_info["rect"]) + font_obj = _get_base14_font(style) + max_width = max(rect.width - 1.0, 1.0) + + prefix = current_text[: entity_span[0]] + suffix = current_text[entity_span[1] :] + + for candidate in _build_display_token_candidates(token): + candidate_text = f"{prefix}{candidate}{suffix}" + if ( + font_obj.text_length( + candidate_text, fontsize=float(style.get("size") or 10.0) + ) + <= max_width + 0.1 + ): + return candidate + + candidates = _build_display_token_candidates(token) + return candidates[0] if candidates else f"<{token}>" + + +def _apply_widget_ops( + doc: pymupdf.Document, + widget_ops: dict[int, list[dict]], +) -> None: + """ + Applies collected replacements to editable text widgets. + + Args: + doc (pymupdf.Document): The PDF document being processed. + widget_ops (dict[int, list[dict]]): The collected text widget operations grouped by page index. + """ + for page_idx, ops in widget_ops.items(): + if not ops: + continue + + page = doc[page_idx] + widgets = { + int(widget.xref): widget + for widget in (page.widgets() or []) + if widget.field_type == pymupdf.PDF_WIDGET_TYPE_TEXT + } + grouped: dict[int, list[dict]] = {} + for op in ops: + grouped.setdefault(int(op["widget_xref"]), []).append(op) + + for widget_xref, replacements in grouped.items(): + widget = widgets.get(widget_xref) + if widget is None: + logger.warning( + "Could not resolve PDF widget xref=%s on page=%s", + widget_xref, + page_idx, + ) + continue + + current_text = str(widget.field_value or "") + if not current_text: + continue + + search_cursor = 0 + changed = False + for replacement in replacements: + entity_text = replacement["entity_text"] + span = _find_flexible(current_text, entity_text, start=search_cursor) + if span is None: + span = _find_flexible(current_text, entity_text, start=0) + if span is None: + logger.warning( + "Could not map widget label '%s' in widget '%s' on page=%s", + 
entity_text, + replacement.get("field_name") or widget.field_name, + page_idx, + ) + continue + + token_text = _fit_widget_token( + replacement["widget_info"], + current_text, + span, + replacement["logical_token"], + ) + current_text = ( + f"{current_text[: span[0]]}{token_text}{current_text[span[1] :]}" + ) + search_cursor = span[0] + len(token_text) + changed = True + + if not changed: + continue + + try: + widget.field_value = current_text + widget.update() + except Exception as exc: + logger.warning( + "Failed to update PDF widget '%s' on page=%s: %s", + widget.field_name, + page_idx, + exc, + ) + + +def _prepare_signature_widget_ops( + doc: pymupdf.Document, + signature_widget_ops: dict[int, list[dict]], +) -> None: + """ + Deletes signature widgets and prepares their replacement operations. + + Args: + doc (pymupdf.Document): The PDF document being processed. + signature_widget_ops (dict[int, list[dict]]): The collected signature widget operations grouped by page index. + """ + for page_idx, ops in signature_widget_ops.items(): + if not ops: + continue + + page = doc[page_idx] + widgets = { + int(widget.xref): widget + for widget in (page.widgets() or []) + if widget.field_type == pymupdf.PDF_WIDGET_TYPE_SIGNATURE + } + grouped: dict[int, list[dict]] = {} + for op in ops: + grouped.setdefault(int(op["widget_xref"]), []).append(op) + + for widget_xref, widget_group_ops in grouped.items(): + widget = widgets.get(widget_xref) + widget_rect = pymupdf.Rect( + widget_group_ops[0].get("widget_rect") or (0, 0, 0, 0) + ) + + if widget is not None: + widget_rect = pymupdf.Rect(widget.rect) + try: + page.delete_widget(widget) + except Exception as exc: + logger.warning( + "Failed to delete signature widget xref=%s on page=%s: %s", + widget_xref, + page_idx, + exc, + ) + else: + logger.warning( + "Could not resolve PDF signature widget xref=%s on page=%s", + widget_xref, + page_idx, + ) + + for op in widget_group_ops: + op["widget_rect"] = pymupdf.Rect(widget_rect) + 
op["asset_rect"] = pymupdf.Rect(widget_rect) + op["graphics_mode"] = pymupdf.PDF_REDACT_LINE_ART_REMOVE_IF_COVERED + op["background_rect"] = _signature_background_rect(op, widget_rect) From 783a68f0bbc68aa31c7260a423b9715ed05db60f Mon Sep 17 00:00:00 2001 From: jansaldo Date: Fri, 17 Apr 2026 17:06:57 +0000 Subject: [PATCH 24/28] =?UTF-8?q?=E2=9C=85=20Add=20integration=20tests=20f?= =?UTF-8?q?or=20PDF=20and=20DOCX=20anonymizers,=20including=20metadata=20s?= =?UTF-8?q?crubbing=20and=20link=20preservation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api/routers/anonymizer/test_anonymizer.py | 264 ++++++++++++++++++ 1 file changed, 264 insertions(+) diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py index 2dd50d4..321f32f 100644 --- a/tests/api/routers/anonymizer/test_anonymizer.py +++ b/tests/api/routers/anonymizer/test_anonymizer.py @@ -1,14 +1,244 @@ +import base64 import json +import re import subprocess +from datetime import datetime, timedelta, timezone +from pathlib import Path from unittest.mock import MagicMock, patch +import pymupdf import pytest +from docx import Document from aymurai.database.schema import AnonymizationParagraph from aymurai.database.utils import text_to_uuid +from aymurai.text.anonymization import DocxAnonymizer, PdfAnonymizer, get_anonymizer from tests.api.conftest import build_label from tests.api.routers.conftest import build_mock_pipeline +PNG_1X1 = base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+a6R8AAAAASUVORK5CYII=" +) +WATERMARK_URL = "https://www.aymurai.info/" + + +def _write_pdf(path: Path, configure) -> Path: + doc = pymupdf.open() + page = doc.new_page() + configure(doc, page) + doc.save(path) + doc.close() + return path + + +def _label_dict(text: str, label: str = "PER", **attrs) -> dict: + payload = build_label(label, text).model_dump(mode="json") + 
payload["attrs"].update(attrs) + return payload + + +def _run_pdf_anonymizer( + tmp_path: Path, + source_path: Path, + document: str, + labels: list[dict], +) -> Path: + output_dir = tmp_path / "out" + output_dir.mkdir(exist_ok=True) + output_path = PdfAnonymizer().anonymize( + {"path": str(source_path)}, + [{"document": document, "labels": labels}], + str(output_dir), + ) + return Path(output_path) + + +@pytest.mark.integration +def test_anonymization_package_exports_and_registry_are_stable(): + assert PdfAnonymizer.__name__ == "PdfAnonymizer" + assert DocxAnonymizer.__name__ == "DocxAnonymizer" + assert isinstance(get_anonymizer("pdf"), PdfAnonymizer) + assert isinstance(get_anonymizer("docx"), DocxAnonymizer) + + +@pytest.mark.integration +def test_pdf_anonymizer_falls_back_from_invalid_alt_offsets(tmp_path): + document = "Ana Perez firmo el escrito" + source_path = _write_pdf( + tmp_path / "invalid-alt.pdf", + lambda _doc, page: page.insert_text((72, 72), document), + ) + labels = [ + _label_dict( + "Ana Perez", + aymurai_alt_start_char=999, + aymurai_alt_end_char=1000, + ) + ] + + output_path = _run_pdf_anonymizer(tmp_path, source_path, document, labels) + + with pymupdf.open(output_path) as output_doc: + page_text = output_doc[0].get_text() + + assert "Ana Perez" not in page_text + assert "" in page_text + + +@pytest.mark.integration +def test_pdf_anonymizer_scrubs_pdf_payloads_and_preserves_safe_links(tmp_path): + document = "Ana Perez presento el escrito" + + def configure(doc: pymupdf.Document, page: pymupdf.Page) -> None: + page.insert_text((72, 72), document) + sensitive_rect = page.search_for("Ana Perez")[0] + page.insert_link( + { + "kind": pymupdf.LINK_URI, + "from": sensitive_rect, + "uri": "https://secret.example", + } + ) + safe_rect = pymupdf.Rect(72, 140, 180, 155) + page.insert_text((72, 150), "Portal publico") + page.insert_link( + { + "kind": pymupdf.LINK_URI, + "from": safe_rect, + "uri": "https://safe.example", + } + ) + 
page.add_file_annot((220, 72), b"attached secret", "attached.txt") + doc.set_metadata( + { + "title": "Secret title", + "author": "Secret author", + "subject": "Secret subject", + "keywords": "alpha,beta", + "creator": "Secret creator", + "producer": "Secret producer", + } + ) + doc.set_xml_metadata("top-secret") + doc.embfile_add("secret.txt", b"secret bytes", filename="secret.txt") + + source_path = _write_pdf(tmp_path / "metadata.pdf", configure) + labels = [_label_dict("Ana Perez")] + + output_path = _run_pdf_anonymizer(tmp_path, source_path, document, labels) + + with pymupdf.open(output_path) as output_doc: + page = output_doc[0] + link_uris = {link.get("uri") for link in page.get_links()} + + assert output_doc.metadata.get("title") == "" + assert output_doc.metadata.get("subject") == "" + assert output_doc.metadata.get("keywords") == "" + assert output_doc.metadata.get("creationDate") == "" + assert re.fullmatch( + r"D:\d{14}\+00'00'", + output_doc.metadata.get("modDate") or "", + ) + assert output_doc.metadata.get("trapped") == "" + assert output_doc.metadata.get("author") == "" + assert output_doc.metadata.get("creator") == "AymurAI" + assert output_doc.metadata.get("producer") == "AymurAI" + assert not output_doc.get_xml_metadata() + assert output_doc.embfile_names() == [] + assert list(page.annots() or []) == [] + assert "https://secret.example" not in link_uris + assert "https://safe.example" in link_uris + assert WATERMARK_URL in link_uris + + +@pytest.mark.integration +def test_pdf_anonymizer_removes_image_backed_entities(tmp_path): + source_path = _write_pdf( + tmp_path / "image.pdf", + lambda _doc, page: ( + page.insert_image(pymupdf.Rect(60, 60, 220, 110), stream=PNG_1X1), + page.insert_text((80, 90), "Ana Perez"), + ), + ) + + output_path = _run_pdf_anonymizer( + tmp_path, + source_path, + "Ana Perez", + [_label_dict("Ana Perez")], + ) + + with pymupdf.open(output_path) as output_doc: + page = output_doc[0] + page_text = page.get_text() + + assert 
page.get_image_info() == [] + assert "Ana Perez" not in page_text + assert "" in page_text + + +@pytest.mark.integration +def test_pdf_anonymizer_removes_signature_widgets_without_restoring_appearance( + tmp_path, +): + def configure(_doc: pymupdf.Document, page: pymupdf.Page) -> None: + page.insert_text((80, 90), "Ana Perez") + widget = pymupdf.Widget() + widget.field_name = "sig_1" + widget.field_type = pymupdf.PDF_WIDGET_TYPE_SIGNATURE + widget.rect = pymupdf.Rect(60, 60, 220, 110) + page.add_widget(widget) + + source_path = _write_pdf(tmp_path / "signature.pdf", configure) + output_path = _run_pdf_anonymizer( + tmp_path, + source_path, + "Ana Perez", + [_label_dict("Ana Perez")], + ) + + with pymupdf.open(output_path) as output_doc: + page = output_doc[0] + page_text = page.get_text() + + assert list(page.widgets() or []) == [] + assert page.get_image_info() == [] + assert "Ana Perez" not in page_text + assert "" in page_text + + +@pytest.mark.integration +def test_docx_anonymizer_sets_aymurai_core_properties(tmp_path): + source_path = tmp_path / "source.docx" + document = Document() + document.add_paragraph("Ana Perez firmo el escrito") + document.core_properties.author = "Sensitive Author" + document.core_properties.last_modified_by = "Sensitive Modifier" + document.save(source_path) + + started_at = datetime.now(timezone.utc).replace(microsecond=0) + + output_path = DocxAnonymizer().anonymize( + {"path": str(source_path)}, + [ + { + "document": "Ana Perez firmo el escrito", + "labels": [_label_dict("Ana Perez")], + } + ], + str(tmp_path / "out"), + ) + + output_document = Document(output_path) + core_properties = output_document.core_properties + assert core_properties.author == "" + assert core_properties.last_modified_by == "AymurAI" + assert core_properties.modified is not None + modified = core_properties.modified + if modified.tzinfo is None: + modified = modified.replace(tzinfo=timezone.utc) + assert started_at <= modified <= datetime.now(timezone.utc) 
+ timedelta(seconds=5) + @pytest.mark.integration @patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.load_pipeline") @@ -293,6 +523,40 @@ def test_should_return_validation_when_paragraph_exists(client, db_session): @pytest.mark.integration +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") +def test_should_return_application_pdf_when_pdf_document_is_anonymized( + mock_get_anonymizer, + client, + tmp_path, +): + anonymized_path = _write_pdf( + tmp_path / "output.pdf", + lambda _doc, page: page.insert_text((72, 72), "Anonymized PDF output"), + ) + mock_get_anonymizer.return_value = MagicMock(return_value=str(anonymized_path)) + + annotations = { + "data": [ + { + "document": "Ana Perez presento el escrito", + "labels": [build_label("PER", "Ana Perez").model_dump(mode="json")], + } + ], + "label_policies": {"PER": {"anonymize": True, "disambiguation": "none"}}, + "render_policy": {"suffix_mode": "auto", "suffix_threshold": 1}, + } + + response = client.post( + "/anonymizer/anonymize-document", + data={"annotations": json.dumps(annotations)}, + files={"file": ("sample.pdf", b"%PDF-1.4 fake", "application/pdf")}, + ) + + assert response.status_code == 200 + assert response.headers["content-type"] == "application/pdf" + assert len(response.content) > 0 + + @patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.subprocess.check_output") @patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.get_anonymizer") def test_should_anonymize_document_when_annotations_are_valid( From cbbd9071c581136442efa34a9098708ab9c8fdc9 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Fri, 17 Apr 2026 17:50:03 +0000 Subject: [PATCH 25/28] =?UTF-8?q?=E2=9C=A8=20Add=20watermark=20layout=20ad?= =?UTF-8?q?justments=20to=20avoid=20footer=20content=20overlap=20in=20PDF?= =?UTF-8?q?=20anonymization?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/anonymization/pdf/watermark.py | 243 
++++++++++++++++++-- 1 file changed, 225 insertions(+), 18 deletions(-) diff --git a/aymurai/text/anonymization/pdf/watermark.py b/aymurai/text/anonymization/pdf/watermark.py index 50b2b2e..c15d9ae 100644 --- a/aymurai/text/anonymization/pdf/watermark.py +++ b/aymurai/text/anonymization/pdf/watermark.py @@ -19,6 +19,10 @@ WATERMARK_FONT_SIZE = 10.0 WATERMARK_MARGIN_X = 24.0 WATERMARK_BASELINE_MARGIN = 12.0 +WATERMARK_TOP_BASELINE = 22.0 +WATERMARK_RECT_PADDING_X = 4.0 +WATERMARK_RECT_PADDING_Y = 4.0 +WATERMARK_COLLISION_PADDING = 12.0 WATERMARK_TEXT_COLOR = tuple(channel / 255 for channel in (192, 192, 192)) WATERMARK_LINK_COLOR = tuple(channel / 255 for channel in (115, 190, 250)) @@ -240,9 +244,214 @@ def _insert_watermark_text( page.insert_text(point, text, **kwargs) +def _expanded_rect(rect: pymupdf.Rect, padding: float) -> pymupdf.Rect: + """ + Expands a rectangle by a uniform padding in every direction. + + Args: + rect (pymupdf.Rect): The rectangle to expand. + padding (float): The amount of padding to apply on every side. + + Returns: + pymupdf.Rect: The expanded rectangle. + """ + return pymupdf.Rect( + rect.x0 - padding, + rect.y0 - padding, + rect.x1 + padding, + rect.y1 + padding, + ) + + +def _watermark_corner_order(page_index: int) -> list[str]: + """ + Builds the preferred watermark corner order for a page. + + Args: + page_index (int): The page index being processed. + + Returns: + list[str]: The ordered watermark corner candidates for the page. + """ + if page_index % 2 == 0: + return ["bottom-right", "bottom-left", "top-left", "top-right"] + return ["bottom-left", "top-left", "top-right", "bottom-right"] + + +def _watermark_layout_for_corner( + page: pymupdf.Page, + corner: str, + *, + prefix_width: float, + link_width: float, + total_width: float, +) -> dict[str, Any]: + """ + Builds the watermark geometry for a specific page corner. + + Args: + page (pymupdf.Page): The PDF page being processed. 
+ corner (str): The corner identifier used to position the watermark. + prefix_width (float): The rendered width of the watermark prefix text. + link_width (float): The rendered width of the watermark link text. + total_width (float): The total rendered width of the watermark text. + + Returns: + dict[str, Any]: The watermark layout data for the corner. + """ + if corner.endswith("right"): + x_start = max( + WATERMARK_MARGIN_X, + page.rect.width - total_width - WATERMARK_MARGIN_X, + ) + else: + x_start = WATERMARK_MARGIN_X + + if corner.startswith("bottom"): + baseline_y = page.rect.height - WATERMARK_BASELINE_MARGIN + else: + baseline_y = WATERMARK_TOP_BASELINE + + link_x = x_start + prefix_width + text_top = baseline_y - WATERMARK_FONT_SIZE + banner_rect = pymupdf.Rect( + x_start - WATERMARK_RECT_PADDING_X, + text_top - WATERMARK_RECT_PADDING_Y, + x_start + total_width + WATERMARK_RECT_PADDING_X, + baseline_y + WATERMARK_RECT_PADDING_Y, + ) + link_rect = pymupdf.Rect( + link_x, + text_top, + link_x + link_width, + baseline_y + 2.0, + ) + + return { + "corner": corner, + "x_start": x_start, + "baseline_y": baseline_y, + "link_x": link_x, + "banner_rect": banner_rect, + "link_rect": link_rect, + } + + +def _occupied_page_rects(page: pymupdf.Page) -> list[pymupdf.Rect]: + """ + Collects page rectangles already occupied by visible content. + + Args: + page (pymupdf.Page): The PDF page being processed. + + Returns: + list[pymupdf.Rect]: The occupied rectangles found on the page. 
+ """ + occupied: list[pymupdf.Rect] = [] + + text_data = page.get_text("dict") + for block in text_data.get("blocks", []): + bbox = block.get("bbox") + if bbox is None: + continue + rect = pymupdf.Rect(bbox) + if rect.get_area() <= 0: + continue + occupied.append(_expanded_rect(rect, WATERMARK_COLLISION_PADDING)) + + for drawing in page.get_drawings(): + rect = drawing.get("rect") + if rect is None: + continue + rect = pymupdf.Rect(rect) + if rect.get_area() <= 0: + continue + occupied.append(_expanded_rect(rect, WATERMARK_COLLISION_PADDING)) + + return occupied + + +def _watermark_overlap_score( + banner_rect: pymupdf.Rect, + occupied_rects: list[pymupdf.Rect], +) -> tuple[float, float, int]: + """ + Scores a watermark placement by the amount of page content it overlaps. + + Args: + banner_rect (pymupdf.Rect): The watermark banner rectangle being scored. + occupied_rects (list[pymupdf.Rect]): The occupied page rectangles used for overlap checks. + + Returns: + tuple[float, float, int]: The overlap ratio, overlap area, and overlap count for the placement. + """ + overlap_area = 0.0 + overlap_count = 0 + banner_area = max(banner_rect.get_area(), 1.0) + + for rect in occupied_rects: + if not banner_rect.intersects(rect): + continue + intersection = banner_rect & rect + area = intersection.get_area() + if area <= 0: + continue + overlap_area += area + overlap_count += 1 + + return overlap_area / banner_area, overlap_area, overlap_count + + +def _choose_watermark_layout( + page: pymupdf.Page, + page_index: int, + *, + prefix_width: float, + link_width: float, + total_width: float, +) -> dict[str, Any]: + """ + Selects the watermark placement with the least overlap on a page. + + Args: + page (pymupdf.Page): The PDF page being processed. + page_index (int): The page index being processed. + prefix_width (float): The rendered width of the watermark prefix text. + link_width (float): The rendered width of the watermark link text. 
+ total_width (float): The total rendered width of the watermark text. + + Returns: + dict[str, Any]: The chosen watermark layout data. + """ + occupied_rects = _occupied_page_rects(page) + candidate_layouts = [ + _watermark_layout_for_corner( + page, + corner, + prefix_width=prefix_width, + link_width=link_width, + total_width=total_width, + ) + for corner in _watermark_corner_order(page_index) + ] + + best_layout = candidate_layouts[0] + best_score: tuple[float, float, int] | None = None + + for layout in candidate_layouts: + score = _watermark_overlap_score(layout["banner_rect"], occupied_rects) + if score[0] == 0.0 and score[1] == 0.0: + return layout + if best_score is None or score < best_score: + best_layout = layout + best_score = score + + return best_layout + + def add_pdf_footer_watermark(doc: pymupdf.Document) -> None: """ - Adds the anonymization watermark to the footer of each PDF page. + Adds the anonymization watermark to the least crowded corner of each PDF page. Args: doc (pymupdf.Document): The PDF document being processed. 
@@ -263,16 +472,16 @@ def add_pdf_footer_watermark(doc: pymupdf.Document) -> None: total_width = prefix_width + link_width for page_index, page in enumerate(doc): - if page_index % 2 == 0: - x_start = max( - WATERMARK_MARGIN_X, - page.rect.width - total_width - WATERMARK_MARGIN_X, - ) - else: - x_start = WATERMARK_MARGIN_X - - baseline_y = page.rect.height - WATERMARK_BASELINE_MARGIN - link_x = x_start + prefix_width + layout = _choose_watermark_layout( + page, + page_index, + prefix_width=prefix_width, + link_width=link_width, + total_width=total_width, + ) + baseline_y = layout["baseline_y"] + x_start = layout["x_start"] + link_x = layout["link_x"] _insert_watermark_text( page, @@ -293,7 +502,10 @@ def add_pdf_footer_watermark(doc: pymupdf.Document) -> None: fontfile=font_config["link_fontfile"], ) - underline_y = min(page.rect.height - 1.0, baseline_y + 1.0) + if layout["corner"].startswith("bottom"): + underline_y = min(page.rect.height - 1.0, baseline_y + 1.0) + else: + underline_y = baseline_y + 1.0 page.draw_line( (link_x, underline_y), (link_x + link_width, underline_y), @@ -304,12 +516,7 @@ def add_pdf_footer_watermark(doc: pymupdf.Document) -> None: page.insert_link( { "kind": pymupdf.LINK_URI, - "from": pymupdf.Rect( - link_x, - baseline_y - WATERMARK_FONT_SIZE, - link_x + link_width, - min(page.rect.height, baseline_y + 2.0), - ), + "from": layout["link_rect"], "uri": WATERMARK_URL, } ) From 4262fe7c9fe82f28608d76dba4a11c6a0397cb0b Mon Sep 17 00:00:00 2001 From: jansaldo Date: Fri, 17 Apr 2026 17:50:15 +0000 Subject: [PATCH 26/28] =?UTF-8?q?=E2=9C=85=20Add=20integration=20test=20to?= =?UTF-8?q?=20ensure=20watermark=20is=20positioned=20away=20from=20footer?= =?UTF-8?q?=20content=20in=20PDF=20anonymization?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api/routers/anonymizer/test_anonymizer.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git 
a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py index 321f32f..0e26c49 100644 --- a/tests/api/routers/anonymizer/test_anonymizer.py +++ b/tests/api/routers/anonymizer/test_anonymizer.py @@ -151,6 +151,35 @@ def configure(doc: pymupdf.Document, page: pymupdf.Page) -> None: assert WATERMARK_URL in link_uris +@pytest.mark.integration +def test_pdf_anonymizer_moves_watermark_away_from_footer_content(tmp_path): + document = "Ana Perez presento el escrito" + footer_rect = pymupdf.Rect(360, 760, 575, 815) + + def configure(_doc: pymupdf.Document, page: pymupdf.Page) -> None: + page.insert_text((72, 72), document) + page.draw_rect(footer_rect, color=(0, 0, 0), fill=(0, 0, 0), overlay=True) + + source_path = _write_pdf(tmp_path / "footer-watermark.pdf", configure) + output_path = _run_pdf_anonymizer( + tmp_path, + source_path, + document, + [_label_dict("Ana Perez")], + ) + + with pymupdf.open(output_path) as output_doc: + page = output_doc[0] + watermark_links = [ + link for link in page.get_links() if link.get("uri") == WATERMARK_URL + ] + + assert len(watermark_links) == 1 + watermark_rect = pymupdf.Rect(watermark_links[0]["from"]) + assert not watermark_rect.intersects(footer_rect) + assert watermark_rect.x1 < footer_rect.x0 + + @pytest.mark.integration def test_pdf_anonymizer_removes_image_backed_entities(tmp_path): source_path = _write_pdf( From 7d8c1d346c9a6c605ecfb09a8f7eb3669315e3f6 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Fri, 17 Apr 2026 18:18:18 +0000 Subject: [PATCH 27/28] =?UTF-8?q?=F0=9F=A9=B9=20Fix:=20read=20docx=20xml?= =?UTF-8?q?=20as=20utf-8=20across=20platforms?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- aymurai/text/anonymization/alignment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aymurai/text/anonymization/alignment.py b/aymurai/text/anonymization/alignment.py index 49e4955..e4f2547 100644 --- 
a/aymurai/text/anonymization/alignment.py +++ b/aymurai/text/anonymization/alignment.py @@ -338,7 +338,7 @@ def index_paragraphs(file: str) -> list[dict]: list[dict]: A list of dictionaries representing the indexed paragraphs. """ # Read the XML file - with open(file) as f: + with open(file, encoding="utf-8-sig") as f: xml = f.read() paragraphs = [] From 107628c223a77ae0682b178977485903ec3c7bc9 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Fri, 17 Apr 2026 18:19:22 +0000 Subject: [PATCH 28/28] =?UTF-8?q?=E2=9C=85=20Add=20Windows-specific=20xfai?= =?UTF-8?q?l=20marker=20for=20PDF=20tests=20and=20implement=20UTF-8=20XML?= =?UTF-8?q?=20reading=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api/routers/anonymizer/test_anonymizer.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py index 0e26c49..e003ad3 100644 --- a/tests/api/routers/anonymizer/test_anonymizer.py +++ b/tests/api/routers/anonymizer/test_anonymizer.py @@ -2,6 +2,7 @@ import json import re import subprocess +import sys from datetime import datetime, timedelta, timezone from pathlib import Path from unittest.mock import MagicMock, patch @@ -13,6 +14,7 @@ from aymurai.database.schema import AnonymizationParagraph from aymurai.database.utils import text_to_uuid from aymurai.text.anonymization import DocxAnonymizer, PdfAnonymizer, get_anonymizer +from aymurai.text.anonymization.alignment import index_paragraphs from tests.api.conftest import build_label from tests.api.routers.conftest import build_mock_pipeline @@ -21,6 +23,12 @@ ) WATERMARK_URL = "https://www.aymurai.info/" +WINDOWS_PYMUPDF_LAYOUT_XFAIL = pytest.mark.xfail( + sys.platform == "win32", + reason="pymupdf4llm ONNX layout model receives int32 tensors on Windows (expects int64)", + strict=False, +) + def _write_pdf(path: Path, configure) -> Path: doc = pymupdf.open() @@ 
-62,6 +70,7 @@ def test_anonymization_package_exports_and_registry_are_stable(): @pytest.mark.integration +@WINDOWS_PYMUPDF_LAYOUT_XFAIL def test_pdf_anonymizer_falls_back_from_invalid_alt_offsets(tmp_path): document = "Ana Perez firmo el escrito" source_path = _write_pdf( @@ -86,6 +95,7 @@ def test_pdf_anonymizer_falls_back_from_invalid_alt_offsets(tmp_path): @pytest.mark.integration +@WINDOWS_PYMUPDF_LAYOUT_XFAIL def test_pdf_anonymizer_scrubs_pdf_payloads_and_preserves_safe_links(tmp_path): document = "Ana Perez presento el escrito" @@ -152,6 +162,7 @@ def configure(doc: pymupdf.Document, page: pymupdf.Page) -> None: @pytest.mark.integration +@WINDOWS_PYMUPDF_LAYOUT_XFAIL def test_pdf_anonymizer_moves_watermark_away_from_footer_content(tmp_path): document = "Ana Perez presento el escrito" footer_rect = pymupdf.Rect(360, 760, 575, 815) @@ -181,6 +192,7 @@ def configure(_doc: pymupdf.Document, page: pymupdf.Page) -> None: @pytest.mark.integration +@WINDOWS_PYMUPDF_LAYOUT_XFAIL def test_pdf_anonymizer_removes_image_backed_entities(tmp_path): source_path = _write_pdf( tmp_path / "image.pdf", @@ -207,6 +219,7 @@ def test_pdf_anonymizer_removes_image_backed_entities(tmp_path): @pytest.mark.integration +@WINDOWS_PYMUPDF_LAYOUT_XFAIL def test_pdf_anonymizer_removes_signature_widgets_without_restoring_appearance( tmp_path, ): @@ -236,6 +249,24 @@ def configure(_doc: pymupdf.Document, page: pymupdf.Page) -> None: assert "" in page_text +def test_index_paragraphs_reads_docx_xml_as_utf8(tmp_path): + xml_path = tmp_path / "document.xml" + xml_path.write_bytes( + """ + + + Señora — resolución + + +""".encode("utf-8") + ) + + paragraphs = index_paragraphs(str(xml_path)) + + assert len(paragraphs) == 1 + assert paragraphs[0]["plain_text"] == "Señora — resolución" + + @pytest.mark.integration def test_docx_anonymizer_sets_aymurai_core_properties(tmp_path): source_path = tmp_path / "source.docx"