Merged
28 commits
78a296c
✨ feat(extractors): use pymupdf layout for pdf text extraction
jansaldo Mar 17, 2026
ff7c9d3
✨ feat(normalization): enhance document normalization to preserve par…
jansaldo Mar 17, 2026
6243dae
📝 docs: document default values for extractor and normalization helpers
jansaldo Mar 17, 2026
eda11cc
🩹 fix(extractors): use pymupdf4llm.to_text with page_chunks for pdf p…
jansaldo Mar 17, 2026
bad66a0
♻️ Add DOCX and PDF anonymizer modules
jansaldo Mar 30, 2026
8759a79
🔧 Enhance PDF and DOCX handling in anonymization process
jansaldo Mar 30, 2026
c608750
📝 Update backend module references for document rendering in README
jansaldo Mar 30, 2026
0dec423
✅ Update tests to use DOCX format for document anonymization and enha…
jansaldo Mar 30, 2026
c107647
✨ Add end-to-end PDF anonymization notebook with PyMuPDF and AymurAI API
jansaldo Mar 30, 2026
f1ac135
♻️ Rework PDF anonymization for precise spans and widget handling
jansaldo Apr 6, 2026
cbcc235
🔧 Update model_dump calls to exclude None values for improved data ha…
jansaldo Apr 9, 2026
b452034
📝 Add docstrings to label replacement functions
jansaldo Apr 9, 2026
f3f9f34
♻️ Refactor watermark handling and optimize PDF token aliasing
jansaldo Apr 9, 2026
8d41f7e
✅ Add integration tests for merging fragmented numeric labels and exc…
jansaldo Apr 9, 2026
e665edb
➖ Remove opencv-python-headless dependency from project requirements
jansaldo Apr 9, 2026
713e4ee
♻️ Implement paragraph splitting function to enhance document text ex…
jansaldo Apr 9, 2026
ef3f672
🔧 Update dependency installation command to prevent Python downloads
jansaldo Apr 9, 2026
7866914
🔥 Remove redundant tests for merging fragmented numeric labels and PD…
jansaldo Apr 9, 2026
dd1153d
♻️ Refactor anonymizer tests to use DOCX format and enhance mock func…
jansaldo Apr 9, 2026
c37ba34
🔧 Add xfail marker for PDF extraction test on Windows due to tensor t…
jansaldo Apr 9, 2026
620540b
✨ Enhance PDF anonymization by adding cleanup rects, removing overlap…
jansaldo Apr 10, 2026
9c11eb1
🔧 Remove redundant return statement in _label_replacement_text function
jansaldo Apr 17, 2026
435b305
♻️ Refactor anonymization module: split pdf and docx internals by format
jansaldo Apr 17, 2026
783a68f
✅ Add integration tests for PDF and DOCX anonymizers, including metad…
jansaldo Apr 17, 2026
cbbd907
✨ Add watermark layout adjustments to avoid footer content overlap in…
jansaldo Apr 17, 2026
4262fe7
✅ Add integration test to ensure watermark is positioned away from fo…
jansaldo Apr 17, 2026
7d8c1d3
🩹 Fix: read docx xml as utf-8 across platforms
jansaldo Apr 17, 2026
107628c
✅ Add Windows-specific xfail marker for PDF tests and implement UTF-8…
jansaldo Apr 17, 2026
102 changes: 54 additions & 48 deletions aymurai/api/endpoints/routers/anonymizer/anonymizer.py
@@ -5,7 +5,7 @@
from threading import Lock

import torch
from fastapi import Body, Depends, Form, Query, UploadFile
from fastapi import Body, Depends, Form, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse
from fastapi.routing import APIRouter
from sqlmodel import Session
@@ -31,7 +31,10 @@
TextRequest,
)
from aymurai.settings import settings
from aymurai.text.anonymization import DocAnonymizer, replace_labels_in_text
from aymurai.text.anonymization import (
InvalidDocumentAnonymizer,
get_anonymizer,
)
from aymurai.text.extraction import MIMETYPE_EXTENSION_MAPPER
from aymurai.utils.entity_disambiguation import (
build_canonical_entities,
Expand Down Expand Up @@ -514,11 +517,21 @@ async def anonymizer_compile_document(
"""
logger.info(f"receiving => {file.filename}")
extension = MIMETYPE_EXTENSION_MAPPER.get(file.content_type)
logger.info(f"detection extension: {extension} ({file.content_type})")
file_suffix = os.path.splitext(file.filename or "")[1].lower()

if extension is None and file_suffix:
extension = file_suffix.lstrip(".")

if extension not in {"docx", "pdf"}:
raise HTTPException(
status_code=400,
detail=f"Unsupported format for anonymization: {extension or 'unknown'}",
)

logger.info(f"detected extension: {extension} ({file.content_type})")

# Create a temporary file
_, suffix = os.path.splitext(file.filename)
suffix = suffix if suffix == ".docx" else ".txt"
suffix = f".{extension}"
tmp_dir = tempfile.gettempdir()

# Use delete=False to avoid the file being deleted when the NamedTemporaryFile object is closed
@@ -537,7 +550,7 @@

annots_json = json.loads(annotations)
annots = DocumentAnnotations.model_validate(annots_json)
logger.info(f"processing annotations => {annots}")

effective_label_policies = _merge_label_policies(annots.label_policies)
effective_render_policy = _merge_render_policy(annots.render_policy)

@@ -562,9 +575,6 @@
override=False,
)

# Anonymize the document
doc_anonymizer = DocAnonymizer()

filtered_annotations = []
for paragraph in annots.data:
filtered_labels = [
@@ -583,70 +593,66 @@
filtered_annotations, effective_render_policy, effective_label_policies
)

if suffix == ".docx":
item = {"path": tmp_filename}
doc_anonymizer.render_context = render_context
doc_anonymizer(
item,
[
document_information.model_dump()
for document_information in filtered_annotations
],
preds = [
document_information.model_dump(mode="json", exclude_none=True)
for document_information in filtered_annotations
]

try:
anonymizer = get_anonymizer(extension)
anonymized_path = anonymizer(
{"path": tmp_filename},
preds,
tmp_dir,
render_context=render_context,
)
except (ValueError, InvalidDocumentAnonymizer) as exc:
if os.path.exists(tmp_filename):
os.remove(tmp_filename)
raise HTTPException(status_code=400, detail=str(exc)) from exc

if extension == "pdf":
if os.path.exists(tmp_filename):
os.remove(tmp_filename)

return FileResponse(
anonymized_path,
background=BackgroundTask(os.remove, anonymized_path),
media_type="application/pdf",
filename=f"{os.path.splitext(file.filename)[0]}.pdf",
)
logger.info(f"saved temp file on local storage => {tmp_filename}")

else:
# Export as raw document
anonymized_doc = [
replace_labels_in_text(
document_information.model_dump(),
render_context=render_context,
)
.replace("&lt;", "<")
.replace("&gt;", ">")
for document_information in filtered_annotations
]
with open(tmp_filename, "w") as f:
f.write("\n".join(anonymized_doc))

# Add watermark to the end of the document
f.write(
"\n\nDocumento anonimizado por AymurAI\n\nhttps://www.aymurai.info/"
)

# Convert to ODT
# DOCX flow keeps ODT output
cmd = [
settings.LIBREOFFICE_BIN,
"--headless",
"--convert-to",
"odt",
"--outdir",
tmp_dir,
tmp_filename,
anonymized_path,
]

logger.info(f"Executing: {' '.join(cmd)}")

try:
output = subprocess.check_output(
cmd, shell=False, encoding="utf-8", errors="ignore"
)
logger.info(f"LibreOffice output: {output}")
except subprocess.CalledProcessError as e:
except subprocess.CalledProcessError as exc:
raise RuntimeError(
f"LibreOffice conversion failed: {e.output.decode('utf-8', errors='ignore')}"
)
f"LibreOffice conversion failed: {exc.output.decode('utf-8', errors='ignore')}"
) from exc
finally:
if os.path.exists(tmp_filename):
os.remove(tmp_filename)

odt = tmp_filename.replace(suffix, ".odt")
odt = f"{os.path.splitext(anonymized_path)[0]}.odt"
logger.info(f"Expected output file path: {odt}")

if not os.path.exists(odt):
raise RuntimeError(f"File at path {odt} does not exist.")

# Ensure the temporary file is deleted
os.remove(tmp_filename)

return FileResponse(
odt,
background=BackgroundTask(os.remove, odt),
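Note on the endpoint change: the single DocAnonymizer path is replaced by a registry lookup keyed on the file extension. A minimal sketch of the new flow, using only the names visible in the diff (the standalone wrapper function is invented for illustration):

from aymurai.text.anonymization import InvalidDocumentAnonymizer, get_anonymizer

def compile_anonymized(tmp_filename: str, extension: str, preds: list[dict],
                       tmp_dir: str, render_context: dict) -> str:
    # Hypothetical helper mirroring the endpoint logic above
    if extension not in {"docx", "pdf"}:
        raise ValueError(f"Unsupported format for anonymization: {extension}")
    try:
        anonymizer = get_anonymizer(extension)  # DocxAnonymizer or PdfAnonymizer
        return anonymizer(                      # path of the anonymized output file
            {"path": tmp_filename},
            preds,
            tmp_dir,
            render_context=render_context,
        )
    except (ValueError, InvalidDocumentAnonymizer):
        # The endpoint turns these into an HTTP 400 and removes the temp file
        raise

For PDFs the endpoint returns the result directly as a FileResponse; for DOCX the anonymized path still goes through the LibreOffice ODT conversion shown in the hunk above.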
10 changes: 7 additions & 3 deletions aymurai/api/endpoints/routers/misc/document_extract.py
@@ -31,7 +31,7 @@ def extraction(path: str) -> str:
str: Extracted text from the document.
"""
text = extract_document(path)
return document_normalize(text) if text else ""
return document_normalize(text, preserve_paragraphs=True) if text else ""


def run_safe_text_extraction(
@@ -112,8 +112,12 @@ def plain_text_extractor(file: UploadFile) -> Document:

document_id = data_to_uuid(data)

paragraphs = [line.strip() for line in document.split("\n") if line.strip()]
paragraphs = [re.sub(r"\s{2,}", " ", line) for line in paragraphs]
paragraphs = [
paragraph.strip()
for paragraph in re.split(r"\n\s*\n+", document)
if paragraph.strip()
]
paragraphs = [re.sub(r"[ \t]{2,}", " ", paragraph) for paragraph in paragraphs]
paragraphs = list(unique_justseen(paragraphs))

return Document(document=paragraphs, document_id=document_id)
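The extractor now splits on blank lines instead of individual newlines, so hard-wrapped lines stay inside one paragraph. A quick illustration with invented input (unique_justseen is assumed to come from more_itertools, as elsewhere in the codebase):

import re

from more_itertools import unique_justseen

document = "Primer párrafo que\ncontinúa en la línea siguiente.\n\nSegundo   párrafo."

# Split on blank lines so a hard-wrapped paragraph stays in one piece
paragraphs = [p.strip() for p in re.split(r"\n\s*\n+", document) if p.strip()]
# Collapse runs of spaces/tabs only, leaving the internal newline untouched
paragraphs = [re.sub(r"[ \t]{2,}", " ", p) for p in paragraphs]
paragraphs = list(unique_justseen(paragraphs))
# -> ["Primer párrafo que\ncontinúa en la línea siguiente.", "Segundo párrafo."]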
8 changes: 4 additions & 4 deletions aymurai/database/crud/anonymization/paragraph.py
@@ -27,7 +27,7 @@ def _serialize_doclabels(value: list[DocLabel] | None):
"""
if value is None:
return None
return _DOC_LABELS_ADAPTER.dump_python(value, mode="json")
return _DOC_LABELS_ADAPTER.dump_python(value, mode="json", exclude_none=True)


def _normalize_paragraph_payload(payload: dict) -> dict:
@@ -63,7 +63,7 @@ def anonymization_paragraph_create(
Returns:
AnonymizationParagraph: The persisted paragraph record.
"""
payload = _normalize_paragraph_payload(paragraph_in.model_dump())
payload = _normalize_paragraph_payload(paragraph_in.model_dump(exclude_none=True))
new_paragraph = AnonymizationParagraph(**payload)

if override:
@@ -171,14 +171,14 @@ def anonymization_paragraph_batch_create_update(

paragraph = session.get(AnonymizationParagraph, paragraph_id)
if paragraph:
payload = _normalize_paragraph_payload(p_in.model_dump())
payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True))
payload.pop("id", None)
for field, value in payload.items():
if value is not None:
setattr(paragraph, field, value)

else:
payload = _normalize_paragraph_payload(p_in.model_dump())
payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True))
paragraph = AnonymizationParagraph(**payload)

session.add(paragraph)
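The exclude_none=True additions are plain Pydantic behaviour: fields whose value is None are dropped from the serialized payload instead of being written out as nulls. A toy model (not the real DocLabel) shows the difference:

from pydantic import BaseModel

class ToyLabel(BaseModel):
    text: str
    start_char: int | None = None

label = ToyLabel(text="Juan Pérez")
label.model_dump()                   # {'text': 'Juan Pérez', 'start_char': None}
label.model_dump(exclude_none=True)  # {'text': 'Juan Pérez'}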
18 changes: 16 additions & 2 deletions aymurai/text/anonymization/__init__.py
@@ -1,7 +1,21 @@
from aymurai.text.anonymization.alignment import replace_labels_in_text
from aymurai.text.anonymization.doc_anonymizer import DocAnonymizer
from aymurai.text.anonymization.base import (
BaseAnonymizer,
InvalidDocumentAnonymizer,
get_anonymizer,
register_anonymizer,
supported_extensions,
)
from aymurai.text.anonymization.docx import DocxAnonymizer
from aymurai.text.anonymization.pdf import PdfAnonymizer

__all__ = [
"DocAnonymizer",
"BaseAnonymizer",
"DocxAnonymizer",
"PdfAnonymizer",
"InvalidDocumentAnonymizer",
"get_anonymizer",
"register_anonymizer",
"supported_extensions",
"replace_labels_in_text",
]
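A usage sketch for the new public surface; the exact return value of get_anonymizer and its failure mode for unsupported extensions are assumptions based on how the endpoint uses it:

from aymurai.text.anonymization import (
    InvalidDocumentAnonymizer,
    get_anonymizer,
    supported_extensions,
)

print(supported_extensions())       # expected to include "docx" and "pdf"

anonymizer = get_anonymizer("pdf")  # presumably a callable PdfAnonymizer

try:
    get_anonymizer("odt")
except (ValueError, InvalidDocumentAnonymizer):
    # Unsupported formats are expected to fail here, matching the endpoint's 400 path
    pass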
80 changes: 74 additions & 6 deletions aymurai/text/anonymization/alignment.py
@@ -9,9 +9,9 @@
from joblib import hash
from more_itertools import flatten

from aymurai.meta.api_interfaces import LabelPolicy
from aymurai.models.flair.utils import FlairTextNormalize
from aymurai.utils.alignment.core import align_text, tokenize
from aymurai.meta.api_interfaces import LabelPolicy

REGEX_PARAGRAPH = r"((?<!\/)w:p\b)(?P<paragraph>.*?)(\/w:p\b)"
REGEX_FRAGMENT = r"(?<!\/)w:t\b.*?>(?P<text>.*?)(<.*?\/w:t)"
@@ -61,6 +61,72 @@ def resolve_render_token(label: dict, render_context: dict | None = None) -> str
return f"{base}_{index}"


def _label_replacement_start(label: dict) -> int:
"""
Determines the start character index for a label, considering possible alternative attributes.

Args:
label (dict): Label dictionary which may contain alternative start character attributes.

Returns:
int: The start character index for the label.
"""
attrs = label.get("attrs") or {}
alt_start = attrs.get("aymurai_alt_start_char")
start_char = label.get("start_char")
return int(alt_start if alt_start is not None else (start_char or 0))


def _label_replacement_end(label: dict) -> int:
"""
Determines the end character index for a label, considering possible alternative attributes.

Args:
label (dict): Label dictionary which may contain alternative end character attributes.

Returns:
int: The end character index for the label.
"""
attrs = label.get("attrs") or {}
alt_end = attrs.get("aymurai_alt_end_char")
end_char = label.get("end_char")
return int(alt_end if alt_end is not None else (end_char or 0))


def _label_replacement_text(label: dict, document: str) -> str:
"""
Determines the replacement text for a label, considering possible alternative attributes.

Args:
label (dict): Label dictionary which may contain alternative text attributes.
document (str): The document text from which to extract the label text.

Returns:
str: The text for the label, considering possible alternative attributes.
"""
attrs = label.get("attrs") or {}

alt_text = attrs.get("aymurai_alt_text")
if alt_text is not None:
return str(alt_text) if alt_text else ""

alt_start = attrs.get("aymurai_alt_start_char")
alt_end = attrs.get("aymurai_alt_end_char")
if alt_start is not None and alt_end is not None:
start_char, end_char = int(alt_start), int(alt_end)
if 0 <= start_char < end_char <= len(document):
return document[start_char:end_char]
return ""

start_char = int(label.get("start_char") or 0)
end_char = int(label.get("end_char") or 0)
if 0 <= start_char < end_char <= len(document):
return document[start_char:end_char]

text = label.get("text")
return str(text) if text else ""


def unify_consecutive_labels(
sample: dict,
text_key: str = "document",
@@ -93,9 +159,11 @@ def unify_consecutive_labels(
# Iterate over labels
for label in labels:
# Get attributes
text = label["attrs"]["aymurai_alt_text"] or label["text"]
start_char = label["attrs"]["aymurai_alt_start_char"] or label["start_char"]
end_char = label["attrs"]["aymurai_alt_end_char"] or label["end_char"]
text = _label_replacement_text(label, document)
start_char = _label_replacement_start(label)
end_char = _label_replacement_end(label)
if not text or end_char <= start_char:
continue
aymurai_label = resolve_render_token(label, render_context)

if current_group is None:
@@ -115,7 +183,7 @@
else:
# Finish the current group and start a new one
current_group["text"] = document[
current_group["start_char"] : current_group["end_char"] + 1
current_group["start_char"] : current_group["end_char"]
]
unified_labels.append(current_group)
current_group = {
@@ -128,7 +196,7 @@
# Finish the last group
if current_group is not None:
current_group["text"] = document[
current_group["start_char"] : current_group["end_char"] + 1
current_group["start_char"] : current_group["end_char"]
]
unified_labels.append(current_group)

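To make the new fallback chain concrete: _label_replacement_text prefers aymurai_alt_text, then the alt offsets, then the plain start_char/end_char slice, and finally label["text"]. A small example with an invented label dict:

from aymurai.text.anonymization.alignment import (
    _label_replacement_end,
    _label_replacement_start,
    _label_replacement_text,
)

document = "La querella fue presentada por Juan Pérez el 3 de marzo."

label = {
    "text": "Juan Perez",  # de-normalized model output
    "start_char": 31,
    "end_char": 41,
    "attrs": {"aymurai_alt_start_char": 31, "aymurai_alt_end_char": 41},
}

_label_replacement_start(label)           # 31
_label_replacement_end(label)             # 41
_label_replacement_text(label, document)  # "Juan Pérez" (sliced from the document)

A practical difference from the previous or-based lookups: alt values of 0 or an empty string are now honored instead of silently falling back to the base fields.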