From ad71aff21272366cfe305ad32f80a327855c2cf2 Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Wed, 29 Jan 2025 11:47:43 +0100
Subject: [PATCH] make the image openshift-friendly

Signed-off-by: Michele Dolfi
---
Review notes (text in this section is not part of the commit message):
- Containerfile: dropped --chmod=664 from the docling_serve COPY. A numeric mode is applied to
  the copied directories as well as the files, and 664 removes the directory search (x) bit,
  which makes the package unreadable at runtime. The build-context modes are kept instead.
- models_download.py: the EasyOCR download now uses a timeout and raise_for_status(), so a
  stalled connection or an HTTP error aborts the build instead of hanging it or surfacing
  later as a BadZipFile.
- The post-image blob ids on the "index" lines below predate these two changes.

 Containerfile      | 64 ++++++++++++++++++++++++++++++++++++++--------------
 models_download.py | 37 ++++++++++++++++++++++++++++++
 os-packages.txt    |  8 ++++++
 3 files changed, 92 insertions(+), 17 deletions(-)
 create mode 100644 models_download.py
 create mode 100644 os-packages.txt

diff --git a/Containerfile b/Containerfile
index ebfc4cc..2dabcd2 100644
--- a/Containerfile
+++ b/Containerfile
@@ -1,32 +1,62 @@
-FROM python:3.11-slim-bookworm
+ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s
+
+FROM ${BASE_IMAGE}
 
 ARG CPU_ONLY=false
-WORKDIR /docling-serve
 
-RUN apt-get update \
-    && apt-get install -y libgl1 libglib2.0-0 curl wget git \
-    && apt-get clean
+USER 0
 
-RUN pip install --no-cache-dir poetry
+###################################################################################################
+# OS Layer                                                                                        #
+###################################################################################################
 
-COPY pyproject.toml poetry.lock README.md /docling-serve/
+RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
+    dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
+    dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
+    dnf config-manager --enable crb && \
+    dnf -y update && \
+    dnf install -y $(cat /tmp/os-packages.txt) && \
+    dnf -y clean all && \
+    rm -rf /var/cache/dnf
 
-RUN if [ "$CPU_ONLY" = "true" ]; then \
-    poetry install --no-root --with cpu; \
-    else \
-    poetry install --no-root; \
-    fi
+ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
 
-ENV HF_HOME=/tmp/
-ENV TORCH_HOME=/tmp/
+###################################################################################################
+# Docling layer                                                                                   #
+###################################################################################################
 
-RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
+USER 1001
+
+WORKDIR /opt/app-root/src
 
 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
 
-COPY ./docling_serve /docling-serve/docling_serve
+ENV LANG=en_US.UTF-8
+ENV LC_ALL=en_US.UTF-8
+ENV PYTHONIOENCODING=utf-8
+
+ENV WITH_UI=True
+
+COPY --chown=1001:0 pyproject.toml poetry.lock models_download.py README.md ./
+
+RUN pip install --no-cache-dir poetry && \
+    # We already are in a virtual environment, so we don't need to create a new one, only activate it.
+    poetry config virtualenvs.create false && \
+    source /opt/app-root/bin/activate && \
+    if [ "$CPU_ONLY" = "true" ]; then \
+        poetry install --no-root --no-cache --no-interaction --all-extras --with cpu --without dev; \
+    else \
+        poetry install --no-root --no-cache --no-interaction --all-extras --without dev; \
+    fi && \
+    echo "Downloading models..." && \
+    python models_download.py && \
+    chown -R 1001:0 /opt/app-root/src && \
+    chmod -R g=u /opt/app-root/src
+
+# No --chmod: a numeric mode such as 664 also lands on directories and strips their search (x) bit.
+COPY --chown=1001:0 ./docling_serve ./docling_serve
 
 EXPOSE 5001
 
-CMD ["poetry", "run", "uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
+CMD ["uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
diff --git a/models_download.py b/models_download.py
new file mode 100644
index 0000000..b3eff3a
--- /dev/null
+++ b/models_download.py
@@ -0,0 +1,37 @@
+import os
+import zipfile
+
+import requests
+from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+
+# Download Docling models
+StandardPdfPipeline.download_models_hf(force=True)
+load_pretrained_nlp_models(verbose=True)
+
+# Download EasyOCR models
+urls = [
+    "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip",
+    "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip"
+]
+
+local_zip_paths = [
+    "/opt/app-root/src/latin_g2.zip",
+    "/opt/app-root/src/craft_mlt_25k.zip"
+]
+
+extract_path = "/opt/app-root/src/.EasyOCR/model/"
+
+for url, local_zip_path in zip(urls, local_zip_paths):
+    # Download the file (bounded wait, so a stalled connection cannot hang the image build)
+    response = requests.get(url, timeout=120)
+    response.raise_for_status()  # abort on HTTP errors instead of saving the error page
+    with open(local_zip_path, "wb") as file:
+        file.write(response.content)
+
+    # Unzip the file
+    with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
+        zip_ref.extractall(extract_path)
+
+    # Clean up the zip file
+    os.remove(local_zip_path)
diff --git a/os-packages.txt b/os-packages.txt
new file mode 100644
index 0000000..ec3e929
--- /dev/null
+++ b/os-packages.txt
@@ -0,0 +1,8 @@
+tesseract
+tesseract-devel
+tesseract-langpack-eng
+leptonica-devel
+libglvnd-glx
+glib2
+wget
+git
\ No newline at end of file