Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
78a296c
✨ feat(extractors): use pymupdf layout for pdf text extraction
jansaldo Mar 17, 2026
ff7c9d3
✨ feat(normalization): enhance document normalization to preserve par…
jansaldo Mar 17, 2026
6243dae
📝 docs: document default values for extractor and normalization helpers
jansaldo Mar 17, 2026
eda11cc
🩹 fix(extractors): use pymupdf4llm.to_text with page_chunks for pdf p…
jansaldo Mar 17, 2026
bad66a0
♻️ Add DOCX and PDF anonymizer modules
jansaldo Mar 30, 2026
8759a79
🔧 Enhance PDF and DOCX handling in anonymization process
jansaldo Mar 30, 2026
c608750
📝 Update backend module references for document rendering in README
jansaldo Mar 30, 2026
0dec423
✅ Update tests to use DOCX format for document anonymization and enha…
jansaldo Mar 30, 2026
c107647
✨ Add end-to-end PDF anonymization notebook with PyMuPDF and AymurAI API
jansaldo Mar 30, 2026
f1ac135
♻️ Rework PDF anonymization for precise spans and widget handling
jansaldo Apr 6, 2026
cbcc235
🔧 Update model_dump calls to exclude None values for improved data ha…
jansaldo Apr 9, 2026
b452034
📝 Add docstrings to label replacement functions
jansaldo Apr 9, 2026
f3f9f34
♻️ Refactor watermark handling and optimize PDF token aliasing
jansaldo Apr 9, 2026
8d41f7e
✅ Add integration tests for merging fragmented numeric labels and exc…
jansaldo Apr 9, 2026
e665edb
➖ Remove opencv-python-headless dependency from project requirements
jansaldo Apr 9, 2026
713e4ee
♻️ Implement paragraph splitting function to enhance document text ex…
jansaldo Apr 9, 2026
ef3f672
🔧 Update dependency installation command to prevent Python downloads
jansaldo Apr 9, 2026
7866914
🔥 Remove redundant tests for merging fragmented numeric labels and PD…
jansaldo Apr 9, 2026
dd1153d
♻️ Refactor anonymizer tests to use DOCX format and enhance mock func…
jansaldo Apr 9, 2026
c37ba34
🔧 Add xfail marker for PDF extraction test on Windows due to tensor t…
jansaldo Apr 9, 2026
620540b
✨ Enhance PDF anonymization by adding cleanup rects, removing overlap…
jansaldo Apr 10, 2026
9c11eb1
🔧 Remove redundant return statement in _label_replacement_text function
jansaldo Apr 17, 2026
435b305
♻️ Refactor anonymization module: split pdf and docx internals by format
jansaldo Apr 17, 2026
783a68f
✅ Add integration tests for PDF and DOCX anonymizers, including metad…
jansaldo Apr 17, 2026
cbbd907
✨ Add watermark layout adjustments to avoid footer content overlap in…
jansaldo Apr 17, 2026
4262fe7
✅ Add integration test to ensure watermark is positioned away from fo…
jansaldo Apr 17, 2026
7d8c1d3
🩹 Fix: read docx xml as utf-8 across platforms
jansaldo Apr 17, 2026
107628c
✅ Add Windows-specific xfail marker for PDF tests and implement UTF-8…
jansaldo Apr 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ jobs:

- name: Install dependencies
run: |
uv sync --frozen --python python --no-dev --no-managed-python --group tests
uv sync --frozen --python python --no-dev --no-python-downloads --group tests

- name: Run api tests
env:
Expand Down
102 changes: 54 additions & 48 deletions aymurai/api/endpoints/routers/anonymizer/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from threading import Lock

import torch
from fastapi import Body, Depends, Form, Query, UploadFile
from fastapi import Body, Depends, Form, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse
from fastapi.routing import APIRouter
from sqlmodel import Session
Expand All @@ -31,7 +31,10 @@
TextRequest,
)
from aymurai.settings import settings
from aymurai.text.anonymization import DocAnonymizer, replace_labels_in_text
from aymurai.text.anonymization import (
InvalidDocumentAnonymizer,
get_anonymizer,
)
from aymurai.text.extraction import MIMETYPE_EXTENSION_MAPPER
from aymurai.utils.entity_disambiguation import (
build_canonical_entities,
Expand Down Expand Up @@ -514,11 +517,21 @@ async def anonymizer_compile_document(
"""
logger.info(f"receiving => {file.filename}")
extension = MIMETYPE_EXTENSION_MAPPER.get(file.content_type)
logger.info(f"detection extension: {extension} ({file.content_type})")
file_suffix = os.path.splitext(file.filename or "")[1].lower()

if extension is None and file_suffix:
extension = file_suffix.lstrip(".")

if extension not in {"docx", "pdf"}:
raise HTTPException(
status_code=400,
detail=f"Unsupported format for anonymization: {extension or 'unknown'}",
)

logger.info(f"detected extension: {extension} ({file.content_type})")

# Create a temporary file
_, suffix = os.path.splitext(file.filename)
suffix = suffix if suffix == ".docx" else ".txt"
suffix = f".{extension}"
tmp_dir = tempfile.gettempdir()

# Use delete=False to avoid the file being deleted when the NamedTemporaryFile object is closed
Expand All @@ -537,7 +550,7 @@ async def anonymizer_compile_document(

annots_json = json.loads(annotations)
annots = DocumentAnnotations.model_validate(annots_json)
logger.info(f"processing annotations => {annots}")

effective_label_policies = _merge_label_policies(annots.label_policies)
effective_render_policy = _merge_render_policy(annots.render_policy)

Expand All @@ -562,9 +575,6 @@ async def anonymizer_compile_document(
override=False,
)

# Anonymize the document
doc_anonymizer = DocAnonymizer()

filtered_annotations = []
for paragraph in annots.data:
filtered_labels = [
Expand All @@ -583,70 +593,66 @@ async def anonymizer_compile_document(
filtered_annotations, effective_render_policy, effective_label_policies
)

if suffix == ".docx":
item = {"path": tmp_filename}
doc_anonymizer.render_context = render_context
doc_anonymizer(
item,
[
document_information.model_dump()
for document_information in filtered_annotations
],
preds = [
document_information.model_dump(mode="json", exclude_none=True)
for document_information in filtered_annotations
]

try:
anonymizer = get_anonymizer(extension)
anonymized_path = anonymizer(
{"path": tmp_filename},
preds,
tmp_dir,
render_context=render_context,
)
except (ValueError, InvalidDocumentAnonymizer) as exc:
if os.path.exists(tmp_filename):
os.remove(tmp_filename)
raise HTTPException(status_code=400, detail=str(exc)) from exc

if extension == "pdf":
if os.path.exists(tmp_filename):
os.remove(tmp_filename)

return FileResponse(
anonymized_path,
background=BackgroundTask(os.remove, anonymized_path),
media_type="application/pdf",
filename=f"{os.path.splitext(file.filename)[0]}.pdf",
)
logger.info(f"saved temp file on local storage => {tmp_filename}")

else:
# Export as raw document
anonymized_doc = [
replace_labels_in_text(
document_information.model_dump(),
render_context=render_context,
)
.replace("&lt;", "<")
.replace("&gt;", ">")
for document_information in filtered_annotations
]
with open(tmp_filename, "w") as f:
f.write("\n".join(anonymized_doc))

# Add watermark to the end of the document
f.write(
"\n\nDocumento anonimizado por AymurAI\n\nhttps://www.aymurai.info/"
)

# Convert to ODT
# DOCX flow keeps ODT output
cmd = [
settings.LIBREOFFICE_BIN,
"--headless",
"--convert-to",
"odt",
"--outdir",
tmp_dir,
tmp_filename,
anonymized_path,
]

logger.info(f"Executing: {' '.join(cmd)}")

try:
output = subprocess.check_output(
cmd, shell=False, encoding="utf-8", errors="ignore"
)
logger.info(f"LibreOffice output: {output}")
except subprocess.CalledProcessError as e:
except subprocess.CalledProcessError as exc:
raise RuntimeError(
f"LibreOffice conversion failed: {e.output.decode('utf-8', errors='ignore')}"
)
f"LibreOffice conversion failed: {exc.output.decode('utf-8', errors='ignore')}"
) from exc
finally:
if os.path.exists(tmp_filename):
os.remove(tmp_filename)

odt = tmp_filename.replace(suffix, ".odt")
odt = f"{os.path.splitext(anonymized_path)[0]}.odt"
logger.info(f"Expected output file path: {odt}")

if not os.path.exists(odt):
raise RuntimeError(f"File at path {odt} does not exist.")

# Ensure the temporary file is deleted
os.remove(tmp_filename)

return FileResponse(
odt,
background=BackgroundTask(os.remove, odt),
Expand Down
21 changes: 16 additions & 5 deletions aymurai/api/endpoints/routers/misc/document_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def extraction(path: str) -> str:
str: Extracted text from the document.
"""
text = extract_document(path)
return document_normalize(text) if text else ""
return document_normalize(text, preserve_paragraphs=True) if text else ""


def run_safe_text_extraction(
Expand Down Expand Up @@ -63,6 +63,20 @@ def run_safe_text_extraction(
raise


_PARAGRAPH_BREAK = re.compile(r"\n\s*\n+")


def _split_document_paragraphs(document: str) -> list[str]:
    """Split raw document text into cleaned, de-duplicated paragraphs.

    Paragraphs are delimited by blank lines when any are present; otherwise
    every non-empty line is treated as its own paragraph. Runs of spaces and
    tabs inside a paragraph are collapsed to a single space, and consecutive
    duplicate paragraphs are dropped (``unique_justseen`` removes adjacent
    repeats only, so non-adjacent duplicates are kept).

    Args:
        document: Full extracted document text.

    Returns:
        list[str]: Cleaned paragraphs in their original order.
    """
    # Single pass over the text: when the blank-line pattern never matches,
    # split() returns the whole document as one element, and we fall back to
    # per-line splitting (same outcome as the previous search-then-split).
    raw_paragraphs = _PARAGRAPH_BREAK.split(document)
    if len(raw_paragraphs) == 1:
        raw_paragraphs = document.splitlines()

    paragraphs = [
        re.sub(r"[ \t]{2,}", " ", paragraph.strip())
        for paragraph in raw_paragraphs
        if paragraph.strip()
    ]
    return list(unique_justseen(paragraphs))


@router.post("/document-extract", response_model=Document)
def plain_text_extractor(file: UploadFile) -> Document:
"""
Expand Down Expand Up @@ -111,9 +125,6 @@ def plain_text_extractor(file: UploadFile) -> Document:
logger.info(f"removed temp file from local storage => {tmp_filename}")

document_id = data_to_uuid(data)

paragraphs = [line.strip() for line in document.split("\n") if line.strip()]
paragraphs = [re.sub(r"\s{2,}", " ", line) for line in paragraphs]
paragraphs = list(unique_justseen(paragraphs))
paragraphs = _split_document_paragraphs(document)

return Document(document=paragraphs, document_id=document_id)
8 changes: 4 additions & 4 deletions aymurai/database/crud/anonymization/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _serialize_doclabels(value: list[DocLabel] | None):
"""
if value is None:
return None
return _DOC_LABELS_ADAPTER.dump_python(value, mode="json")
return _DOC_LABELS_ADAPTER.dump_python(value, mode="json", exclude_none=True)


def _normalize_paragraph_payload(payload: dict) -> dict:
Expand Down Expand Up @@ -63,7 +63,7 @@ def anonymization_paragraph_create(
Returns:
AnonymizationParagraph: The persisted paragraph record.
"""
payload = _normalize_paragraph_payload(paragraph_in.model_dump())
payload = _normalize_paragraph_payload(paragraph_in.model_dump(exclude_none=True))
new_paragraph = AnonymizationParagraph(**payload)

if override:
Expand Down Expand Up @@ -171,14 +171,14 @@ def anonymization_paragraph_batch_create_update(

paragraph = session.get(AnonymizationParagraph, paragraph_id)
if paragraph:
payload = _normalize_paragraph_payload(p_in.model_dump())
payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True))
payload.pop("id", None)
for field, value in payload.items():
if value is not None:
setattr(paragraph, field, value)

else:
payload = _normalize_paragraph_payload(p_in.model_dump())
payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True))
paragraph = AnonymizationParagraph(**payload)

session.add(paragraph)
Expand Down
4 changes: 4 additions & 0 deletions aymurai/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ def assemble_cors_origins(cls, v) -> list[str]:
MEMORY_CACHE_TTL: int = 60

LIBREOFFICE_BIN: str = "libreoffice"
PDF_WATERMARK_FONT_REGULAR: str | None = None
PDF_WATERMARK_FONT_BOLD: str | None = None
ANONYMIZATION_METADATA_CREATOR: str = "AymurAI"
ANONYMIZATION_METADATA_PRODUCER: str = "AymurAI"

# Disambiguation Config

Expand Down
18 changes: 16 additions & 2 deletions aymurai/text/anonymization/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
from aymurai.text.anonymization.alignment import replace_labels_in_text
from aymurai.text.anonymization.doc_anonymizer import DocAnonymizer
from aymurai.text.anonymization.base import (
BaseAnonymizer,
InvalidDocumentAnonymizer,
get_anonymizer,
register_anonymizer,
supported_extensions,
)
from aymurai.text.anonymization.docx import DocxAnonymizer
from aymurai.text.anonymization.pdf import PdfAnonymizer

__all__ = [
"DocAnonymizer",
"BaseAnonymizer",
"DocxAnonymizer",
"PdfAnonymizer",
"InvalidDocumentAnonymizer",
"get_anonymizer",
"register_anonymizer",
"supported_extensions",
"replace_labels_in_text",
]
Loading
Loading