Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
78a296c
✨ feat(extractors): use pymupdf layout for pdf text extraction
jansaldo Mar 17, 2026
ff7c9d3
✨ feat(normalization): enhance document normalization to preserve par…
jansaldo Mar 17, 2026
6243dae
📝 docs: document default values for extractor and normalization helpers
jansaldo Mar 17, 2026
eda11cc
🩹 fix(extractors): use pymupdf4llm.to_text with page_chunks for pdf p…
jansaldo Mar 17, 2026
bad66a0
♻️ Add DOCX and PDF anonymizer modules
jansaldo Mar 30, 2026
8759a79
🔧 Enhance PDF and DOCX handling in anonymization process
jansaldo Mar 30, 2026
c608750
📝 Update backend module references for document rendering in README
jansaldo Mar 30, 2026
0dec423
✅ Update tests to use DOCX format for document anonymization and enha…
jansaldo Mar 30, 2026
c107647
✨ Add end-to-end PDF anonymization notebook with PyMuPDF and AymurAI API
jansaldo Mar 30, 2026
f1ac135
♻️ Rework PDF anonymization for precise spans and widget handling
jansaldo Apr 6, 2026
cbcc235
🔧 Update model_dump calls to exclude None values for improved data ha…
jansaldo Apr 9, 2026
b452034
📝 Add docstrings to label replacement functions
jansaldo Apr 9, 2026
f3f9f34
♻️ Refactor watermark handling and optimize PDF token aliasing
jansaldo Apr 9, 2026
8d41f7e
✅ Add integration tests for merging fragmented numeric labels and exc…
jansaldo Apr 9, 2026
e665edb
➖ Remove opencv-python-headless dependency from project requirements
jansaldo Apr 9, 2026
713e4ee
♻️ Implement paragraph splitting function to enhance document text ex…
jansaldo Apr 9, 2026
ef3f672
🔧 Update dependency installation command to prevent Python downloads
jansaldo Apr 9, 2026
7866914
🔥 Remove redundant tests for merging fragmented numeric labels and PD…
jansaldo Apr 9, 2026
dd1153d
♻️ Refactor anonymizer tests to use DOCX format and enhance mock func…
jansaldo Apr 9, 2026
c37ba34
🔧 Add xfail marker for PDF extraction test on Windows due to tensor t…
jansaldo Apr 9, 2026
620540b
✨ Enhance PDF anonymization by adding cleanup rects, removing overlap…
jansaldo Apr 10, 2026
9c11eb1
🔧 Remove redundant return statement in _label_replacement_text function
jansaldo Apr 17, 2026
435b305
♻️ Refactor anonymization module: split pdf and docx internals by format
jansaldo Apr 17, 2026
783a68f
✅ Add integration tests for PDF and DOCX anonymizers, including metad…
jansaldo Apr 17, 2026
cbbd907
✨ Add watermark layout adjustments to avoid footer content overlap in…
jansaldo Apr 17, 2026
4262fe7
✅ Add integration test to ensure watermark is positioned away from fo…
jansaldo Apr 17, 2026
7d8c1d3
🩹 Fix: read docx xml as utf-8 across platforms
jansaldo Apr 17, 2026
107628c
✅ Add Windows-specific xfail marker for PDF tests and implement UTF-8…
jansaldo Apr 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ jobs:

- name: Install dependencies
run: |
uv sync --frozen --python python --no-dev --no-managed-python --group tests
uv sync --frozen --python python --no-dev --no-python-downloads --group tests

- name: Run api tests
env:
Expand Down
102 changes: 54 additions & 48 deletions aymurai/api/endpoints/routers/anonymizer/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from threading import Lock

import torch
from fastapi import Body, Depends, Form, Query, UploadFile
from fastapi import Body, Depends, Form, HTTPException, Query, UploadFile
from fastapi.responses import FileResponse
from fastapi.routing import APIRouter
from sqlmodel import Session
Expand All @@ -31,7 +31,10 @@
TextRequest,
)
from aymurai.settings import settings
from aymurai.text.anonymization import DocAnonymizer, replace_labels_in_text
from aymurai.text.anonymization import (
InvalidDocumentAnonymizer,
get_anonymizer,
)
from aymurai.text.extraction import MIMETYPE_EXTENSION_MAPPER
from aymurai.utils.entity_disambiguation import (
build_canonical_entities,
Expand Down Expand Up @@ -514,11 +517,21 @@ async def anonymizer_compile_document(
"""
logger.info(f"receiving => {file.filename}")
extension = MIMETYPE_EXTENSION_MAPPER.get(file.content_type)
logger.info(f"detection extension: {extension} ({file.content_type})")
file_suffix = os.path.splitext(file.filename or "")[1].lower()

if extension is None and file_suffix:
extension = file_suffix.lstrip(".")

if extension not in {"docx", "pdf"}:
raise HTTPException(
status_code=400,
detail=f"Unsupported format for anonymization: {extension or 'unknown'}",
)

logger.info(f"detected extension: {extension} ({file.content_type})")

# Create a temporary file
_, suffix = os.path.splitext(file.filename)
suffix = suffix if suffix == ".docx" else ".txt"
suffix = f".{extension}"
tmp_dir = tempfile.gettempdir()

# Use delete=False to avoid the file being deleted when the NamedTemporaryFile object is closed
Expand All @@ -537,7 +550,7 @@ async def anonymizer_compile_document(

annots_json = json.loads(annotations)
annots = DocumentAnnotations.model_validate(annots_json)
logger.info(f"processing annotations => {annots}")

effective_label_policies = _merge_label_policies(annots.label_policies)
effective_render_policy = _merge_render_policy(annots.render_policy)

Expand All @@ -562,9 +575,6 @@ async def anonymizer_compile_document(
override=False,
)

# Anonymize the document
doc_anonymizer = DocAnonymizer()

filtered_annotations = []
for paragraph in annots.data:
filtered_labels = [
Expand All @@ -583,70 +593,66 @@ async def anonymizer_compile_document(
filtered_annotations, effective_render_policy, effective_label_policies
)

if suffix == ".docx":
item = {"path": tmp_filename}
doc_anonymizer.render_context = render_context
doc_anonymizer(
item,
[
document_information.model_dump()
for document_information in filtered_annotations
],
preds = [
document_information.model_dump(mode="json", exclude_none=True)
for document_information in filtered_annotations
]

try:
anonymizer = get_anonymizer(extension)
anonymized_path = anonymizer(
{"path": tmp_filename},
preds,
tmp_dir,
render_context=render_context,
)
except (ValueError, InvalidDocumentAnonymizer) as exc:
if os.path.exists(tmp_filename):
os.remove(tmp_filename)
raise HTTPException(status_code=400, detail=str(exc)) from exc

if extension == "pdf":
if os.path.exists(tmp_filename):
os.remove(tmp_filename)

return FileResponse(
anonymized_path,
background=BackgroundTask(os.remove, anonymized_path),
media_type="application/pdf",
filename=f"{os.path.splitext(file.filename)[0]}.pdf",
)
logger.info(f"saved temp file on local storage => {tmp_filename}")

else:
# Export as raw document
anonymized_doc = [
replace_labels_in_text(
document_information.model_dump(),
render_context=render_context,
)
.replace("&lt;", "<")
.replace("&gt;", ">")
for document_information in filtered_annotations
]
with open(tmp_filename, "w") as f:
f.write("\n".join(anonymized_doc))

# Add watermark to the end of the document
f.write(
"\n\nDocumento anonimizado por AymurAI\n\nhttps://www.aymurai.info/"
)

# Convert to ODT
# DOCX flow keeps ODT output
cmd = [
settings.LIBREOFFICE_BIN,
"--headless",
"--convert-to",
"odt",
"--outdir",
tmp_dir,
tmp_filename,
anonymized_path,
]

logger.info(f"Executing: {' '.join(cmd)}")

try:
output = subprocess.check_output(
cmd, shell=False, encoding="utf-8", errors="ignore"
)
logger.info(f"LibreOffice output: {output}")
except subprocess.CalledProcessError as e:
except subprocess.CalledProcessError as exc:
raise RuntimeError(
f"LibreOffice conversion failed: {e.output.decode('utf-8', errors='ignore')}"
)
f"LibreOffice conversion failed: {exc.output.decode('utf-8', errors='ignore')}"
) from exc
finally:
if os.path.exists(tmp_filename):
os.remove(tmp_filename)

odt = tmp_filename.replace(suffix, ".odt")
odt = f"{os.path.splitext(anonymized_path)[0]}.odt"
logger.info(f"Expected output file path: {odt}")

if not os.path.exists(odt):
raise RuntimeError(f"File at path {odt} does not exist.")

# Ensure the temporary file is deleted
os.remove(tmp_filename)

return FileResponse(
odt,
background=BackgroundTask(os.remove, odt),
Expand Down
21 changes: 16 additions & 5 deletions aymurai/api/endpoints/routers/misc/document_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def extraction(path: str) -> str:
str: Extracted text from the document.
"""
text = extract_document(path)
return document_normalize(text) if text else ""
return document_normalize(text, preserve_paragraphs=True) if text else ""


def run_safe_text_extraction(
Expand Down Expand Up @@ -63,6 +63,20 @@ def run_safe_text_extraction(
raise


_PARAGRAPH_BREAK = re.compile(r"\n\s*\n+")


def _split_document_paragraphs(document: str) -> list[str]:
    """Split raw document text into cleaned, de-duplicated paragraphs.

    Paragraphs are delimited by blank lines when any are present; otherwise
    every non-empty line is treated as its own paragraph. Runs of spaces and
    tabs inside a paragraph are collapsed to a single space, and consecutive
    duplicate paragraphs are dropped (``unique_justseen`` removes adjacent
    repeats only, so non-adjacent duplicates are kept).

    Args:
        document: Full extracted document text.

    Returns:
        list[str]: Cleaned paragraphs in their original order.
    """
    # Single pass over the text: when the blank-line pattern never matches,
    # split() returns the whole document as one element, and we fall back to
    # per-line splitting (same outcome as the previous search-then-split).
    raw_paragraphs = _PARAGRAPH_BREAK.split(document)
    if len(raw_paragraphs) == 1:
        raw_paragraphs = document.splitlines()

    paragraphs = [
        re.sub(r"[ \t]{2,}", " ", paragraph.strip())
        for paragraph in raw_paragraphs
        if paragraph.strip()
    ]
    return list(unique_justseen(paragraphs))


@router.post("/document-extract", response_model=Document)
def plain_text_extractor(file: UploadFile) -> Document:
"""
Expand Down Expand Up @@ -111,9 +125,6 @@ def plain_text_extractor(file: UploadFile) -> Document:
logger.info(f"removed temp file from local storage => {tmp_filename}")

document_id = data_to_uuid(data)

paragraphs = [line.strip() for line in document.split("\n") if line.strip()]
paragraphs = [re.sub(r"\s{2,}", " ", line) for line in paragraphs]
paragraphs = list(unique_justseen(paragraphs))
paragraphs = _split_document_paragraphs(document)

return Document(document=paragraphs, document_id=document_id)
8 changes: 4 additions & 4 deletions aymurai/database/crud/anonymization/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def _serialize_doclabels(value: list[DocLabel] | None):
"""
if value is None:
return None
return _DOC_LABELS_ADAPTER.dump_python(value, mode="json")
return _DOC_LABELS_ADAPTER.dump_python(value, mode="json", exclude_none=True)


def _normalize_paragraph_payload(payload: dict) -> dict:
Expand Down Expand Up @@ -63,7 +63,7 @@ def anonymization_paragraph_create(
Returns:
AnonymizationParagraph: The persisted paragraph record.
"""
payload = _normalize_paragraph_payload(paragraph_in.model_dump())
payload = _normalize_paragraph_payload(paragraph_in.model_dump(exclude_none=True))
new_paragraph = AnonymizationParagraph(**payload)

if override:
Expand Down Expand Up @@ -171,14 +171,14 @@ def anonymization_paragraph_batch_create_update(

paragraph = session.get(AnonymizationParagraph, paragraph_id)
if paragraph:
payload = _normalize_paragraph_payload(p_in.model_dump())
payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True))
payload.pop("id", None)
for field, value in payload.items():
if value is not None:
setattr(paragraph, field, value)

else:
payload = _normalize_paragraph_payload(p_in.model_dump())
payload = _normalize_paragraph_payload(p_in.model_dump(exclude_none=True))
paragraph = AnonymizationParagraph(**payload)

session.add(paragraph)
Expand Down
4 changes: 4 additions & 0 deletions aymurai/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ def assemble_cors_origins(cls, v) -> list[str]:
MEMORY_CACHE_TTL: int = 60

LIBREOFFICE_BIN: str = "libreoffice"
PDF_WATERMARK_FONT_REGULAR: str | None = None
PDF_WATERMARK_FONT_BOLD: str | None = None
ANONYMIZATION_METADATA_CREATOR: str = "AymurAI"
ANONYMIZATION_METADATA_PRODUCER: str = "AymurAI"

# Disambiguation Config

Expand Down
18 changes: 16 additions & 2 deletions aymurai/text/anonymization/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
from aymurai.text.anonymization.alignment import replace_labels_in_text
from aymurai.text.anonymization.doc_anonymizer import DocAnonymizer
from aymurai.text.anonymization.base import (
BaseAnonymizer,
InvalidDocumentAnonymizer,
get_anonymizer,
register_anonymizer,
supported_extensions,
)
from aymurai.text.anonymization.docx import DocxAnonymizer
from aymurai.text.anonymization.pdf import PdfAnonymizer

__all__ = [
"DocAnonymizer",
"BaseAnonymizer",
"DocxAnonymizer",
"PdfAnonymizer",
"InvalidDocumentAnonymizer",
"get_anonymizer",
"register_anonymizer",
"supported_extensions",
"replace_labels_in_text",
]
Loading
Loading