Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
99 commits
Select commit Hold shift + click to select a range
77ddef7
➕ build(deps): Add langextract for text entity extraction
jansaldo Aug 22, 2025
0bbf0d2
🚧 wip: Add langextract entity extraction experiment notebook
jansaldo Aug 22, 2025
71cbd62
Merge branch 'dev' of https://github.com/AymurAI/backend into feature…
jansaldo Nov 10, 2025
c2bc1f2
✨ feat: Enhance entity models with relation handling and canonical re…
jansaldo Nov 10, 2025
d19bb79
✨ feat: Add JSON serialization support and enhance utility functions
jansaldo Nov 10, 2025
1c0edb5
⬆️ Upgrade ML dependencies and refresh uv.lock
jansaldo Nov 10, 2025
fe35a4e
🚧 wip: Update extraction examples in langextract notebook
jansaldo Nov 10, 2025
25070e2
Merge branch 'dev' of https://github.com/AymurAI/backend into feature…
jansaldo Nov 11, 2025
3d3d230
📝 Add entity disambiguation notebook for canonical entity extraction
jansaldo Nov 14, 2025
dabc47f
Merge branch 'release/v2.0.0' of github.com:AymurAI/backend into feat…
jansaldo Nov 18, 2025
444194b
⬆️ Update dependencies: langextract to 1.1.0 and ollama to 0.6.1; add…
jansaldo Nov 18, 2025
8b13aad
📝 Integrate custom OpenAI model for extraction and remove failing em…
jansaldo Nov 18, 2025
68eae78
📝 Update error message format in json_serial function for better read…
jansaldo Nov 18, 2025
4517601
♻️ Inline immediate return in get_pretty
jansaldo Nov 18, 2025
e96d8e4
🐛 Fix: Use json_serial in save_json
jansaldo Nov 18, 2025
c45a863
🎨 Format json.dumps call in save_json for improved readability
jansaldo Nov 18, 2025
6100447
Merge pull request #58 from AymurAI/feature/langextract
jansaldo Nov 18, 2025
401a08b
Feature/ollama service (#59)
jansaldo Nov 19, 2025
29d8328
Feature/llm providers (#60)
jansaldo Nov 20, 2025
d80f74b
Feature/disambiguation metric v2 (#62)
padonizetti Nov 26, 2025
3f04031
Feature/summarization (#61)
jansaldo Nov 26, 2025
60bf959
🩹 Fix YAML key names in prompt defaults for summarization
jansaldo Nov 27, 2025
f042089
♻️ refactor: Restructure USEM module with factory pattern and multipl…
lionelchamorro Dec 5, 2025
f85d7cf
⏪ Rollback to previous torch and torchtext versions to avoid conflicts
jansaldo Dec 5, 2025
5ac37a0
🩹 Fix: Add missing environment variable for OLLAMA_HOST in docker-com…
jansaldo Dec 5, 2025
c1a8a9a
📝 Add anonymization pipeline docs
jansaldo Dec 5, 2025
29b6082
🚧 WIP: Add Playwright PJN scraper
jansaldo Dec 5, 2025
0dbc86d
📝 Add Jupyter notebook for entity disambiguation from pre-clustered v…
jansaldo Dec 10, 2025
26033a8
Feature/pdf extraction upgrade (#65)
jansaldo Dec 17, 2025
2ec306b
Feature/remove usem tensorflow deps (#68)
jansaldo Jan 5, 2026
0a35efe
WIP: feat(decision): ✨ integrate TinyEmbeddingBagClassifier for decis…
jedzill4 Jan 5, 2026
90f7369
🔥 Remove TensorFlow environment variables
jansaldo Jan 7, 2026
0070916
Feature/mlfow integration (#66)
padonizetti Jan 8, 2026
d349c69
Feature/document extract config (#69)
jansaldo Jan 9, 2026
b6829af
Feature/pre disambiguation optimization (#70)
jansaldo Jan 13, 2026
4f844a2
Endpoint /disambiguate with LLM Inference (#72)
conrabeatriz Feb 10, 2026
b343746
Hotfix: resolve file pathing, logic indentation, and date disambiguation
conrabeatriz Feb 10, 2026
bdec740
Feature/anonymize document refactor (#73)
jansaldo Feb 13, 2026
96a88ef
⏪ Revert entrypoint.sh to 1ac2776
jansaldo Jan 30, 2025
20f9e92
⏪ Revert .dockerignore to 5af5814
conrabeatriz Feb 18, 2026
04f3343
⏪ Revert .env.common to 90f7369
conrabeatriz Feb 18, 2026
1dbd24c
⏪ Revert .vscode/launch.json to f366690
conrabeatriz Feb 18, 2026
f645881
⏪ Revert Makefile to cb3df05
conrabeatriz Feb 18, 2026
8983156
⏪ Revert aymurai/api.core.py to 19a9ca8
conrabeatriz Feb 18, 2026
8ca5420
🦖 Changed aymurai/api/endpoints/routers/anonymizer/anonymizer.py for …
conrabeatriz Feb 18, 2026
6933321
🔥 Removed aymurai/api/endpoints/routers/llm for release/v1.5.0 compat…
conrabeatriz Feb 18, 2026
3c55d8e
🦖 Changed aymurai/api/endpoints/routers/misc/document_extract.py for …
conrabeatriz Feb 19, 2026
60ba414
⏪ Revert aymurai/api/main.py to a801bf4
conrabeatriz Feb 19, 2026
596d3b1
🔥 Removed aymurai/api/startup/marker.py for release/v1.5.0 compatibility
conrabeatriz Feb 19, 2026
994aca4
🔥 aymurai/experiments/entity_disambiguation folder for release/v1.5.…
conrabeatriz Feb 19, 2026
55de086
🔥 Removed aymurai/llm_providers for release/v1.5.0 compatibility
conrabeatriz Feb 19, 2026
0c478a9
🦖 Changed aymurai/settings.py for release/v1.5.0 compatibility
conrabeatriz Feb 19, 2026
0990214
🦖 Changed aymurai/api/endpoints/routers/anonymizer/anonymizer.py for …
conrabeatriz Feb 19, 2026
b05b768
⏪ Reverted docker-compose.yml to 5b9c220
conrabeatriz Feb 19, 2026
ebe414b
⏪ Revert docker/api/Dockerfile to 4196117
conrabeatriz Feb 19, 2026
d312eea
🦖 Changed docs/anonymization/README.md for release/v1.5.0 compatibility
conrabeatriz Feb 19, 2026
dde8f0e
🔥 Removed docs/experiments/README.md for realease/v1.5.0 compatibility
conrabeatriz Feb 19, 2026
d560075
🔥 Removed notebooks/experiments/anonymization/05-langextract.ipynb fo…
conrabeatriz Feb 19, 2026
d35c6cb
🔥 Removed all the notebooks from folder: notebooks/experiments/entit…
conrabeatriz Feb 19, 2026
69e90d7
🔥 Removed notebooks/experiments/llm-providers for release/v1.5.0 comp…
conrabeatriz Feb 19, 2026
aa88866
🔥 Removed notebooks/experiments/summarization for release/v1.5.0 comp…
conrabeatriz Feb 19, 2026
e333f4c
🦖 Changed pyproject.toml for release/v1.5.0 compatibility
conrabeatriz Feb 19, 2026
f735c1b
🔥 Removed resources/llm for release/v1.5.0 compatibility
conrabeatriz Feb 19, 2026
568496e
🔥 Removed summarization_app for release/v1.5.0 compatibility
conrabeatriz Feb 19, 2026
0d4e78e
🔥 Removed test/llm_providers for release/v1.5.0 compatibility
conrabeatriz Feb 19, 2026
6fd575a
🐛 Bug fixed in pyproject.toml line 106 for .venv build up
conrabeatriz Feb 20, 2026
ef1b7f5
🐛 Bug fixed in function '_normalize_text' from 'aymurai.text.extracto…
conrabeatriz Feb 20, 2026
90db891
⏪ Revert elimination of folder aymurai/experiments/entity_disambiguat…
conrabeatriz Feb 20, 2026
9afde50
🔥 Removed aymurai/experiments/entity_disambiguation for release/v1.5.…
conrabeatriz Feb 20, 2026
c9e1484
🐛 Bug fixed in experiments/entity-disambiguation/10-anonymize-documen…
conrabeatriz Feb 20, 2026
d02e20f
🔥 Removed TESSDATA_PREFIX from .env.common
jansaldo Feb 20, 2026
eef81c6
🙈 Update .gitignore to include notebooks directory while excluding su…
jansaldo Feb 20, 2026
7f8ebfa
🔀 Synthesize docker-compose from 26033a8f/00709164 after b05b768 roll…
jansaldo Feb 20, 2026
b0233e5
🔀 Synthesize Makefile from afbfda9/d80f74b/26033a8f after f645881 rol…
jansaldo Feb 20, 2026
ca04d8b
🔧 Fix repository URL case sensitivity in pyproject.toml and remove un…
jansaldo Feb 20, 2026
203f33e
🔥 Remove tasks.json configuration for Ollama service
jansaldo Feb 20, 2026
24825e1
🔥 Remove scraper and documentation
jansaldo Feb 20, 2026
a6986fe
🔥 Remove experiment module
jansaldo Feb 20, 2026
968344d
🔥 Remove path utility functions from paths.py
jansaldo Feb 20, 2026
6c35143
🔥 Remove unused PromptSet and PromptLibrary classes, and simplify dis…
jansaldo Feb 20, 2026
4d28e03
🔥 Remove EntityRelation class and its associated methods from entitie…
jansaldo Feb 20, 2026
e6f32ba
📝 Enhance documentation with detailed docstrings for various function…
jansaldo Feb 20, 2026
4559764
🔥 Removed PromptLibrary class from aymurai/api/endpoints/routers/anon…
conrabeatriz Feb 20, 2026
35412a4
🎨 Changed map_canonical_entities_ner_preds function in aymurai/utils/…
conrabeatriz Feb 20, 2026
014b28e
🔀 Synthesize document_extract from d349c69 after 3c55d8e: remove extr…
jansaldo Feb 20, 2026
91d2c10
🔀 Synthesize PDF extraction flow from d349c69/26033a8: remove cache/d…
jansaldo Feb 20, 2026
000215e
🔥 Remove text extraction tests
jansaldo Feb 20, 2026
9e68af6
Merge branch 'release/v1.5.0' of https://github.com/AymurAI/backend i…
jansaldo Feb 20, 2026
0d3aae5
📝 Update description formatting for aymurai_disambiguation field in E…
jansaldo Feb 20, 2026
efecf50
🦖 Update PdfExtractor.extract method to include ignored keyword argum…
jansaldo Feb 20, 2026
837c639
🔥 Remove unused static logo file from API resources
jansaldo Feb 23, 2026
846ae17
🔧 Add version_scheme configuration to setuptools_scm in pyproject.toml
jansaldo Feb 23, 2026
63e96ea
📌 Update uv.lock
jansaldo Feb 23, 2026
8f507c8
📝 Reorganize and update v1.5.0 documentation (EN/ES)
jansaldo Mar 2, 2026
e97a513
🚚 Rename full-paragraph pipeline to datapublic across code and docs
jansaldo Mar 2, 2026
c979d2d
ci(tests): add API + pipeline integration tests on linux and windows …
jedzill4 Mar 13, 2026
03796ac
Feature/pdf layout anonymization (#76)
jansaldo Apr 20, 2026
3ad788b
🐛 Remove unnecessary --extra runtime flag from uv sync command
jansaldo Apr 20, 2026
87c7892
🐛 Date formatter bug fixed for canonical entities generation.
conrabeatriz Apr 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# syntax=docker/dockerfile:1.3
ARG UV_VERSION=0.5.10
ARG UV_VERSION=latest
ARG CORE_IMAGE=ubuntu:noble

FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS astral-uv-source
Expand Down
13 changes: 9 additions & 4 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
"name": "aymurai",
// "initializeCommand": "make core-build",
"dockerComposeFile": "docker-compose.yml",
"service": "aymurai-devcontainer-gpu",
"service": "aymurai-devcontainer",
"runServices": [
"aymurai-devcontainer-gpu"
"aymurai-devcontainer"
],
"workspaceFolder": "/workspace",
"mounts": [
"source=codex-data,target=/home/ubuntu/.codex,type=volume"
],
"customizations": {
"vscode": {
"settings": {
Expand Down Expand Up @@ -47,9 +50,11 @@
"cweijan.vscode-database-client2",
"christian-kohler.path-intellisense",
"github.vscode-github-actions",
"seatonjiang.gitmoji-vscode"
"seatonjiang.gitmoji-vscode",
"openai.chatgpt"
]
}
},
"postCreateCommand": "bash /home/ubuntu/entrypoint.sh"
"postCreateCommand": "bash /home/ubuntu/entrypoint.sh",
"postStartCommand": "sudo chown -R ubuntu:ubuntu /home/ubuntu/.codex && sudo chmod -R u+rwX /home/ubuntu/.codex"
}
6 changes: 4 additions & 2 deletions .devcontainer/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ x-template: &template
- ..:/workspace:cached
- ../notebooks/:/notebooks
- ../resources/:/resources
- ../test/:/test
- ../tests/:/tests
- $HOME/.ssh/:/home/ubuntu/.ssh
- /var/run/docker.sock:/var/run/docker.sock
env_file:
Expand All @@ -27,7 +27,9 @@ services:
resources:
reservations:
devices:
- capabilities: [ gpu ]
- driver: nvidia
count: all
capabilities: [ gpu ]

aymurai-devcontainer:
<<: *template
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/sh

# install dependencies
uv sync --frozen --all-extras
uv sync --frozen --all-extras --all-groups

# configure precommit
uv run pre-commit install
Expand Down
27 changes: 2 additions & 25 deletions .env.common
Original file line number Diff line number Diff line change
@@ -1,31 +1,8 @@
DATASETS_BASEPATH=/resources/datasets
MODELS_BASEPATH=/resources/models

CACHE_PATH=/resources/cache
AYMURAI_CACHE_BASEPATH=/resources/cache/aymurai

CACHE_PATH=/resources/cache
HF_DATASETS_CACHE=/resources/cache/huggingface/cache
TRANSFORMERS_CACHE=/resources/cache/huggingface/transformers
HF_HOME=/resources/cache/huggingface
TOKENIZERS_PARALLELISM=1

TESSDATA_PREFIX=/usr/local/share/tessdata

AYMURAI_RESTRICTED_DOCUMENT_PDFS_PATH="/resources/data/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO-pdf"
AYMURAI_RESTRICTED_DOCUMENT_DOCS_PATH="/resources/data/restricted/ar-juz-pcyf-10/RESOLUCIONES DEL JUZGADO"

TF_CPP_MIN_LOG_LEVEL=3
TFHUB_CACHE_DIR=/resources/cache/tfhub_modules

TORCH_VERSION=2.0.1
CUDA_VERSION=cu118

CORE_IMAGE_CUDA=registry.gitlab.com/collective.ai/datagenero-public/aymurai-core
CORE_IMAGE_CPU=registry.gitlab.com/collective.ai/datagenero-public/aymurai-core-cpu
API_IMAGE=registry.gitlab.com/collective.ai/datagenero-public/aymurai-api

API_HOST=0.0.0.0
API_PORT=8899

SRC_VOLUME_MOUNT=src:/src
RESOURCES_VOLUME_MOUNT=resources:/resources
NOTEBOOKS_VOLUME_MOUNT=notebooks:/notebooks
86 changes: 86 additions & 0 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
name: pytest

on:
pull_request:
types:
- opened
- synchronize
- reopened
- ready_for_review
workflow_dispatch:

permissions:
contents: read

concurrency:
group: pr-tests-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
pytest:
if: ${{ github.event_name != 'pull_request' || github.event.pull_request.draft == false }}
name: pytest (${{ matrix.os }}, py${{ matrix.python-version }})
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
- windows-latest
python-version:
- "3.10"
# - "3.11"
# - "3.12"
# - "3.13"
# - "3.14"
timeout-minutes: 30

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Setup uv
uses: astral-sh/setup-uv@v5
with:
version: latest
enable-cache: true
cache-dependency-glob: uv.lock

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Configure es_AR locale (Ubuntu)
if: runner.os == 'Linux'
run: |
sudo apt-get update
sudo apt-get install --yes locales
sudo locale-gen es_AR.UTF-8
sudo update-locale LANG=es_AR.UTF-8 LC_ALL=es_AR.UTF-8
echo "LANG=es_AR.UTF-8" >> "$GITHUB_ENV"
echo "LC_ALL=es_AR.UTF-8" >> "$GITHUB_ENV"
locale -a

- name: Install dependencies
run: |
uv sync --frozen --python python --no-dev --no-python-downloads --group tests

- name: Run api tests
env:
DISKCACHE_ROOT: /tmp
run: uv run --no-sync pytest -q --tb=short --disable-warnings --color=yes --maxfail=5 tests/api

- name: Download pipelines data
env:
DISKCACHE_ROOT: /tmp
RESOURCES_BASEPATH: resources
AYMURAI_CACHE_BASEPATH: resources/cache/aymurai
run: uv run --no-sync pipeline-download

- name: Run pipeline tests
env:
DISKCACHE_ROOT: /tmp
RESOURCES_BASEPATH: resources
AYMURAI_CACHE_BASEPATH: resources/cache/aymurai
run: uv run --no-sync pytest -q --tb=short --disable-warnings --color=yes --maxfail=5 tests/integration/pipelines
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,10 @@ resources
.venv

aymurai/version.py

.agents
.opencode.json
.sisyphus
notebooks/**
!notebooks/**/
!notebooks/**/*.ipynb
15 changes: 5 additions & 10 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
repos:
- repo: https://github.com/ambv/black
rev: 22.6.0
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.2
hooks:
- id: black
language_version: python3.10
- id: ruff
- id: ruff-format

- repo: https://github.com/kynan/nbstripout
rev: 0.6.0
hooks:
- id: nbstripout

# - repo: https://github.com/pre-commit/pre-commit-hooks
# rev: v2.20.0
# hooks:
# - id: flake8
- id: nbstripout
28 changes: 28 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python-envs.defaultEnvManager": "ms-python.python:venv",
"python-envs.pythonProjects": [],
"files.associations": {
".csv": "csv",
".env*": "dotenv",
".json": "json",
".jsonc": "jsonc",
".jsonl": "jsonl",
".md": "markdown"
},
"github.copilot.enable": {
"*": true,
"dotenv": false,
"csv": false,
"json": false,
"jsonc": false,
"jsonl": false,
"markdown": false,
"plaintext": false,
"scminput": false
}
}
29 changes: 23 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,37 @@ export $(shell sed 's/=.*//' .env)
include .env.common
export $(shell sed 's/=.*//' .env.common)

# Select which API service to control (override with API_SERVICE=aymurai-api-gpu)
API_SERVICE ?= aymurai-api
# Select which full API service to control (override with API_FULL_SERVICE=aymurai-api-full-gpu)
API_FULL_SERVICE ?= aymurai-api-full


api-build:
docker compose build aymurai-api
docker compose build $(API_SERVICE)
api-run:
docker compose run --service-ports aymurai-api
docker compose run --service-ports $(API_SERVICE)
api-up:
docker compose up -d $(API_SERVICE)
api-stop:
docker compose stop $(API_SERVICE)
api-logs:
docker compose logs -f $(API_SERVICE)
api-pull:
docker compose pull aymurai-api
docker compose pull $(API_SERVICE)

api-full-build:
docker compose build aymurai-api-full
docker compose build $(API_FULL_SERVICE)
api-full-run:
docker compose run --service-ports aymurai-api-full
docker compose run --service-ports $(API_FULL_SERVICE)
api-full-up:
docker compose up -d $(API_FULL_SERVICE)
api-full-stop:
docker compose stop $(API_FULL_SERVICE)
api-full-logs:
docker compose logs -f $(API_FULL_SERVICE)
api-full-pull:
docker compose pull aymurai-api-full
docker compose pull $(API_FULL_SERVICE)

stress-test:
locust -f locustfile.py --host http://localhost:8899
Expand Down
Loading
Loading