Merged
28 commits
78a296c
✨ feat(extractors): use pymupdf layout for pdf text extraction
jansaldo Mar 17, 2026
ff7c9d3
✨ feat(normalization): enhance document normalization to preserve par…
jansaldo Mar 17, 2026
6243dae
📝 docs: document default values for extractor and normalization helpers
jansaldo Mar 17, 2026
eda11cc
🩹 fix(extractors): use pymupdf4llm.to_text with page_chunks for pdf p…
jansaldo Mar 17, 2026
bad66a0
♻️ Add DOCX and PDF anonymizer modules
jansaldo Mar 30, 2026
8759a79
🔧 Enhance PDF and DOCX handling in anonymization process
jansaldo Mar 30, 2026
c608750
📝 Update backend module references for document rendering in README
jansaldo Mar 30, 2026
0dec423
✅ Update tests to use DOCX format for document anonymization and enha…
jansaldo Mar 30, 2026
c107647
✨ Add end-to-end PDF anonymization notebook with PyMuPDF and AymurAI API
jansaldo Mar 30, 2026
f1ac135
♻️ Rework PDF anonymization for precise spans and widget handling
jansaldo Apr 6, 2026
cbcc235
🔧 Update model_dump calls to exclude None values for improved data ha…
jansaldo Apr 9, 2026
b452034
📝 Add docstrings to label replacement functions
jansaldo Apr 9, 2026
f3f9f34
♻️ Refactor watermark handling and optimize PDF token aliasing
jansaldo Apr 9, 2026
8d41f7e
✅ Add integration tests for merging fragmented numeric labels and exc…
jansaldo Apr 9, 2026
e665edb
➖ Remove opencv-python-headless dependency from project requirements
jansaldo Apr 9, 2026
713e4ee
♻️ Implement paragraph splitting function to enhance document text ex…
jansaldo Apr 9, 2026
ef3f672
🔧 Update dependency installation command to prevent Python downloads
jansaldo Apr 9, 2026
7866914
🔥 Remove redundant tests for merging fragmented numeric labels and PD…
jansaldo Apr 9, 2026
dd1153d
♻️ Refactor anonymizer tests to use DOCX format and enhance mock func…
jansaldo Apr 9, 2026
c37ba34
🔧 Add xfail marker for PDF extraction test on Windows due to tensor t…
jansaldo Apr 9, 2026
620540b
✨ Enhance PDF anonymization by adding cleanup rects, removing overlap…
jansaldo Apr 10, 2026
9c11eb1
🔧 Remove redundant return statement in _label_replacement_text function
jansaldo Apr 17, 2026
435b305
♻️ Refactor anonymization module: split pdf and docx internals by format
jansaldo Apr 17, 2026
783a68f
✅ Add integration tests for PDF and DOCX anonymizers, including metad…
jansaldo Apr 17, 2026
cbbd907
✨ Add watermark layout adjustments to avoid footer content overlap in…
jansaldo Apr 17, 2026
4262fe7
✅ Add integration test to ensure watermark is positioned away from fo…
jansaldo Apr 17, 2026
7d8c1d3
🩹 Fix: read docx xml as utf-8 across platforms
jansaldo Apr 17, 2026
107628c
✅ Add Windows-specific xfail marker for PDF tests and implement UTF-8…
jansaldo Apr 17, 2026
102 changes: 54 additions & 48 deletions aymurai/api/endpoints/routers/anonymizer/anonymizer.py
@@ -5,7 +5,7 @@
from threading import Lock

import torch
from fastapi import Body, Depends, Form, Query, UploadFile
from fastapi import Body, Depends, Form, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse
from fastapi.routing import APIRouter
from sqlmodel import Session
@@ -31,7 +31,10 @@
TextRequest,
)
from aymurai.settings import settings
from aymurai.text.anonymization import DocAnonymizer, replace_labels_in_text
from aymurai.text.anonymization import (
InvalidDocumentAnonymizer,
get_anonymizer,
)
from aymurai.text.extraction import MIMETYPE_EXTENSION_MAPPER
from aymurai.utils.entity_disambiguation import (
build_canonical_entities,
Expand Down Expand Up @@ -514,11 +517,21 @@ async def anonymizer_compile_document(
"""
logger.info(f"receiving => {file.filename}")
extension = MIMETYPE_EXTENSION_MAPPER.get(file.content_type)
logger.info(f"detection extension: {extension} ({file.content_type})")
file_suffix = os.path.splitext(file.filename or "")[1].lower()

if extension is None and file_suffix:
extension = file_suffix.lstrip(".")

if extension not in {"docx", "pdf"}:
raise HTTPException(
status_code=400,
detail=f"Unsupported format for anonymization: {extension or 'unknown'}",
)

logger.info(f"detected extension: {extension} ({file.content_type})")

# Create a temporary file
_, suffix = os.path.splitext(file.filename)
suffix = suffix if suffix == ".docx" else ".txt"
suffix = f".{extension}"
tmp_dir = tempfile.gettempdir()

# Use delete=False to avoid the file being deleted when the NamedTemporaryFile object is closed
@@ -537,7 +550,7 @@

annots_json = json.loads(annotations)
annots = DocumentAnnotations.model_validate(annots_json)
logger.info(f"processing annotations => {annots}")

effective_label_policies = _merge_label_policies(annots.label_policies)
effective_render_policy = _merge_render_policy(annots.render_policy)

@@ -562,9 +575,6 @@
override=False,
)

# Anonymize the document
doc_anonymizer = DocAnonymizer()

filtered_annotations = []
for paragraph in annots.data:
filtered_labels = [
@@ -583,70 +593,66 @@
filtered_annotations, effective_render_policy, effective_label_policies
)

if suffix == ".docx":
item = {"path": tmp_filename}
doc_anonymizer.render_context = render_context
doc_anonymizer(
item,
[
document_information.model_dump()
for document_information in filtered_annotations
],
preds = [
document_information.model_dump(mode="json", exclude_none=True)
for document_information in filtered_annotations
]

try:
anonymizer = get_anonymizer(extension)
anonymized_path = anonymizer(
{"path": tmp_filename},
preds,
tmp_dir,
render_context=render_context,
)
except (ValueError, InvalidDocumentAnonymizer) as exc:
if os.path.exists(tmp_filename):
os.remove(tmp_filename)
raise HTTPException(status_code=400, detail=str(exc)) from exc

if extension == "pdf":
if os.path.exists(tmp_filename):
os.remove(tmp_filename)

return FileResponse(
anonymized_path,
background=BackgroundTask(os.remove, anonymized_path),
media_type="application/pdf",
filename=f"{os.path.splitext(file.filename)[0]}.pdf",
)
logger.info(f"saved temp file on local storage => {tmp_filename}")

else:
# Export as raw document
anonymized_doc = [
replace_labels_in_text(
document_information.model_dump(),
render_context=render_context,
)
.replace("&lt;", "<")
.replace("&gt;", ">")
for document_information in filtered_annotations
]
with open(tmp_filename, "w") as f:
f.write("\n".join(anonymized_doc))

# Add watermark to the end of the document
f.write(
"\n\nDocumento anonimizado por AymurAI\n\nhttps://www.aymurai.info/"
)

# Convert to ODT
# DOCX flow keeps ODT output
cmd = [
settings.LIBREOFFICE_BIN,
"--headless",
"--convert-to",
"odt",
"--outdir",
tmp_dir,
tmp_filename,
anonymized_path,
]

logger.info(f"Executing: {' '.join(cmd)}")

try:
output = subprocess.check_output(
cmd, shell=False, encoding="utf-8", errors="ignore"
)
logger.info(f"LibreOffice output: {output}")
except subprocess.CalledProcessError as e:
except subprocess.CalledProcessError as exc:
raise RuntimeError(
f"LibreOffice conversion failed: {e.output.decode('utf-8', errors='ignore')}"
)
f"LibreOffice conversion failed: {exc.output.decode('utf-8', errors='ignore')}"
) from exc
finally:
if os.path.exists(tmp_filename):
os.remove(tmp_filename)

odt = tmp_filename.replace(suffix, ".odt")
odt = f"{os.path.splitext(anonymized_path)[0]}.odt"
logger.info(f"Expected output file path: {odt}")

if not os.path.exists(odt):
raise RuntimeError(f"File at path {odt} does not exist.")

# Ensure the temporary file is deleted
os.remove(tmp_filename)

return FileResponse(
odt,
background=BackgroundTask(os.remove, odt),
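Note on the endpoint change: the single DocAnonymizer path is replaced by a registry lookup keyed on the file extension. A minimal sketch of the new flow, using only the names visible in the diff (the standalone wrapper function is invented for illustration):

from aymurai.text.anonymization import InvalidDocumentAnonymizer, get_anonymizer

def compile_anonymized(tmp_filename: str, extension: str, preds: list[dict],
                       tmp_dir: str, render_context: dict) -> str:
    # Hypothetical helper mirroring the endpoint logic above
    if extension not in {"docx", "pdf"}:
        raise ValueError(f"Unsupported format for anonymization: {extension}")
    try:
        anonymizer = get_anonymizer(extension)  # DocxAnonymizer or PdfAnonymizer
        return anonymizer(                      # path of the anonymized output file
            {"path": tmp_filename},
            preds,
            tmp_dir,
            render_context=render_context,
        )
    except (ValueError, InvalidDocumentAnonymizer):
        # The endpoint turns these into an HTTP 400 and removes the temp file
        raise

For PDFs the endpoint returns the result directly as a FileResponse; for DOCX the anonymized path still goes through the LibreOffice ODT conversion shown in the hunk above.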
10 changes: 7 additions & 3 deletions aymurai/api/endpoints/routers/misc/document_extract.py
@@ -31,7 +31,7 @@ def extraction(path: str) -> str:
str: Extracted text from the document.
"""
text = extract_document(path)
return document_normalize(text) if text else ""
return document_normalize(text, preserve_paragraphs=True) if text else ""


def run_safe_text_extraction(
@@ -112,8 +112,12 @@ def plain_text_extractor(file: UploadFile) -> Document:

document_id = data_to_uuid(data)

paragraphs = [line.strip() for line in document.split("\n") if line.strip()]
paragraphs = [re.sub(r"\s{2,}", " ", line) for line in paragraphs]
paragraphs = [
paragraph.strip()
for paragraph in re.split(r"\n\s*\n+", document)
if paragraph.strip()
]
paragraphs = [re.sub(r"[ \t]{2,}", " ", paragraph) for paragraph in paragraphs]
paragraphs = list(unique_justseen(paragraphs))

return Document(document=paragraphs, document_id=document_id)
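The extractor now splits on blank lines instead of individual newlines, so hard-wrapped lines stay inside one paragraph. A quick illustration with invented input (unique_justseen is assumed to come from more_itertools, as elsewhere in the codebase):

import re

from more_itertools import unique_justseen

document = "Primer párrafo que\ncontinúa en la línea siguiente.\n\nSegundo   párrafo."

# Split on blank lines so a hard-wrapped paragraph stays in one piece
paragraphs = [p.strip() for p in re.split(r"\n\s*\n+", document) if p.strip()]
# Collapse runs of spaces/tabs only, leaving the internal newline untouched
paragraphs = [re.sub(r"[ \t]{2,}", " ", p) for p in paragraphs]
paragraphs = list(unique_justseen(paragraphs))
# -> ["Primer párrafo que\ncontinúa en la línea siguiente.", "Segundo párrafo."]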
8 changes: 4 additions & 4 deletions aymurai/database/crud/anonymization/paragraph.py
@@ -27,7 +27,7 @@ def _serialize_doclabels(value: list[DocLabel] | None):
"""
if value is None:
return None
return _DOC_LABELS_ADAPTER.dump_python(value, mode="json")
return _DOC_LABELS_ADAPTER.dump_python(value, mode="json", exclude_none=True)


def _normalize_paragraph_payload(payload: dict) -> dict:
@@ -63,7 +63,7 @@ def anonymization_paragraph_create(
Returns:
AnonymizationParagraph: The persisted paragraph record.
"""
payload = _normalize_paragraph_payload(paragraph_in.model_dump())
payload = _normalize_paragraph_payload(paragraph_in.model_dump(exclude_none=True))
new_paragraph = AnonymizationParagraph(**payload)

if override:
@@ -171,14 +171,14 @@ def anonymization_paragraph_batch_create_update(

paragraph = session.get(AnonymizationParagraph, paragraph_id)
if paragraph:
payload = _normalize_paragraph_payload(p_in.model_dump())
payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True))
payload.pop("id", None)
for field, value in payload.items():
if value is not None:
setattr(paragraph, field, value)

else:
payload = _normalize_paragraph_payload(p_in.model_dump())
payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True))
paragraph = AnonymizationParagraph(**payload)

session.add(paragraph)
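The exclude_none=True additions are plain Pydantic behaviour: fields whose value is None are dropped from the serialized payload instead of being written out as nulls. A toy model (not the real DocLabel) shows the difference:

from pydantic import BaseModel

class ToyLabel(BaseModel):
    text: str
    start_char: int | None = None

label = ToyLabel(text="Juan Pérez")
label.model_dump()                   # {'text': 'Juan Pérez', 'start_char': None}
label.model_dump(exclude_none=True)  # {'text': 'Juan Pérez'}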
18 changes: 16 additions & 2 deletions aymurai/text/anonymization/__init__.py
@@ -1,7 +1,21 @@
from aymurai.text.anonymization.alignment import replace_labels_in_text
from aymurai.text.anonymization.doc_anonymizer import DocAnonymizer
from aymurai.text.anonymization.base import (
BaseAnonymizer,
InvalidDocumentAnonymizer,
get_anonymizer,
register_anonymizer,
supported_extensions,
)
from aymurai.text.anonymization.docx import DocxAnonymizer
from aymurai.text.anonymization.pdf import PdfAnonymizer

__all__ = [
"DocAnonymizer",
"BaseAnonymizer",
"DocxAnonymizer",
"PdfAnonymizer",
"InvalidDocumentAnonymizer",
"get_anonymizer",
"register_anonymizer",
"supported_extensions",
"replace_labels_in_text",
]
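A usage sketch for the new public surface; the exact return value of get_anonymizer and its failure mode for unsupported extensions are assumptions based on how the endpoint uses it:

from aymurai.text.anonymization import (
    InvalidDocumentAnonymizer,
    get_anonymizer,
    supported_extensions,
)

print(supported_extensions())       # expected to include "docx" and "pdf"

anonymizer = get_anonymizer("pdf")  # presumably a callable PdfAnonymizer

try:
    get_anonymizer("odt")
except (ValueError, InvalidDocumentAnonymizer):
    # Unsupported formats are expected to fail here, matching the endpoint's 400 path
    pass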
80 changes: 74 additions & 6 deletions aymurai/text/anonymization/alignment.py
@@ -9,9 +9,9 @@
from joblib import hash
from more_itertools import flatten

from aymurai.meta.api_interfaces import LabelPolicy
from aymurai.models.flair.utils import FlairTextNormalize
from aymurai.utils.alignment.core import align_text, tokenize
from aymurai.meta.api_interfaces import LabelPolicy

REGEX_PARAGRAPH = r"((?<!\/)w:p\b)(?P<paragraph>.*?)(\/w:p\b)"
REGEX_FRAGMENT = r"(?<!\/)w:t\b.*?>(?P<text>.*?)(<.*?\/w:t)"
@@ -61,6 +61,72 @@ def resolve_render_token(label: dict, render_context: dict | None = None) -> str
return f"{base}_{index}"


def _label_replacement_start(label: dict) -> int:
"""
Determines the start character index for a label, considering possible alternative attributes.

Args:
label (dict): Label dictionary which may contain alternative start character attributes.

Returns:
int: The start character index for the label.
"""
attrs = label.get("attrs") or {}
alt_start = attrs.get("aymurai_alt_start_char")
start_char = label.get("start_char")
return int(alt_start if alt_start is not None else (start_char or 0))


def _label_replacement_end(label: dict) -> int:
"""
Determines the end character index for a label, considering possible alternative attributes.

Args:
label (dict): Label dictionary which may contain alternative end character attributes.

Returns:
int: The end character index for the label.
"""
attrs = label.get("attrs") or {}
alt_end = attrs.get("aymurai_alt_end_char")
end_char = label.get("end_char")
return int(alt_end if alt_end is not None else (end_char or 0))


def _label_replacement_text(label: dict, document: str) -> str:
"""
Determines the replacement text for a label, considering possible alternative attributes.

Args:
label (dict): Label dictionary which may contain alternative text attributes.
document (str): The document text from which to extract the label text.

Returns:
str: The text for the label, considering possible alternative attributes.
"""
attrs = label.get("attrs") or {}

alt_text = attrs.get("aymurai_alt_text")
if alt_text is not None:
return str(alt_text) if alt_text else ""

alt_start = attrs.get("aymurai_alt_start_char")
alt_end = attrs.get("aymurai_alt_end_char")
if alt_start is not None and alt_end is not None:
start_char, end_char = int(alt_start), int(alt_end)
if 0 <= start_char < end_char <= len(document):
return document[start_char:end_char]
return ""

start_char = int(label.get("start_char") or 0)
end_char = int(label.get("end_char") or 0)
if 0 <= start_char < end_char <= len(document):
return document[start_char:end_char]

text = label.get("text")
return str(text) if text else ""


def unify_consecutive_labels(
sample: dict,
text_key: str = "document",
@@ -93,9 +159,11 @@ def unify_consecutive_labels(
# Iterate over labels
for label in labels:
# Get attributes
text = label["attrs"]["aymurai_alt_text"] or label["text"]
start_char = label["attrs"]["aymurai_alt_start_char"] or label["start_char"]
end_char = label["attrs"]["aymurai_alt_end_char"] or label["end_char"]
text = _label_replacement_text(label, document)
start_char = _label_replacement_start(label)
end_char = _label_replacement_end(label)
if not text or end_char <= start_char:
continue
aymurai_label = resolve_render_token(label, render_context)

if current_group is None:
@@ -115,7 +183,7 @@
else:
# Finish the current group and start a new one
current_group["text"] = document[
current_group["start_char"] : current_group["end_char"] + 1
current_group["start_char"] : current_group["end_char"]
]
unified_labels.append(current_group)
current_group = {
@@ -128,7 +196,7 @@
# Finish the last group
if current_group is not None:
current_group["text"] = document[
current_group["start_char"] : current_group["end_char"] + 1
current_group["start_char"] : current_group["end_char"]
]
unified_labels.append(current_group)

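To make the new fallback chain concrete: _label_replacement_text prefers aymurai_alt_text, then the alt offsets, then the plain start_char/end_char slice, and finally label["text"]. A small example with an invented label dict:

from aymurai.text.anonymization.alignment import (
    _label_replacement_end,
    _label_replacement_start,
    _label_replacement_text,
)

document = "La querella fue presentada por Juan Pérez el 3 de marzo."

label = {
    "text": "Juan Perez",  # de-normalized model output
    "start_char": 31,
    "end_char": 41,
    "attrs": {"aymurai_alt_start_char": 31, "aymurai_alt_end_char": 41},
}

_label_replacement_start(label)           # 31
_label_replacement_end(label)             # 41
_label_replacement_text(label, document)  # "Juan Pérez" (sliced from the document)

A practical difference from the previous or-based lookups: alt values of 0 or an empty string are now honored instead of silently falling back to the base fields.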