Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Container image for OpenShift #24

Merged
merged 1 commit into from
Jan 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 46 additions & 17 deletions Containerfile
Original file line number Diff line number Diff line change
@@ -1,32 +1,61 @@
FROM python:3.11-slim-bookworm
ARG BASE_IMAGE=quay.io/sclorg/python-312-c9s:c9s

FROM ${BASE_IMAGE}

ARG CPU_ONLY=false
WORKDIR /docling-serve

RUN apt-get update \
&& apt-get install -y libgl1 libglib2.0-0 curl wget git \
&& apt-get clean
USER 0

RUN pip install --no-cache-dir poetry
###################################################################################################
# OS Layer #
###################################################################################################

COPY pyproject.toml poetry.lock README.md /docling-serve/
RUN --mount=type=bind,source=os-packages.txt,target=/tmp/os-packages.txt \
dnf -y install --best --nodocs --setopt=install_weak_deps=False dnf-plugins-core && \
dnf config-manager --best --nodocs --setopt=install_weak_deps=False --save && \
dnf config-manager --enable crb && \
dnf -y update && \
dnf install -y $(cat /tmp/os-packages.txt) && \
dnf -y clean all && \
rm -rf /var/cache/dnf

RUN if [ "$CPU_ONLY" = "true" ]; then \
poetry install --no-root --with cpu; \
else \
poetry install --no-root; \
fi
ENV TESSDATA_PREFIX=/usr/share/tesseract/tessdata/

ENV HF_HOME=/tmp/
ENV TORCH_HOME=/tmp/
###################################################################################################
# Docling layer #
###################################################################################################

RUN poetry run python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
USER 1001

WORKDIR /opt/app-root/src

# On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4

COPY ./docling_serve /docling-serve/docling_serve
ENV LANG=en_US.UTF-8
ENV LC_ALL=en_US.UTF-8
ENV PYTHONIOENCODING=utf-8

ENV WITH_UI=True

COPY --chown=1001:0 pyproject.toml poetry.lock models_download.py README.md ./

RUN pip install --no-cache-dir poetry && \
# We already are in a virtual environment, so we don't need to create a new one, only activate it.
poetry config virtualenvs.create false && \
source /opt/app-root/bin/activate && \
if [ "$CPU_ONLY" = "true" ]; then \
poetry install --no-root --no-cache --no-interaction --all-extras --with cpu --without dev; \
else \
poetry install --no-root --no-cache --no-interaction --all-extras --without dev; \
fi && \
echo "Downloading models..." && \
python models_download.py && \
chown -R 1001:0 /opt/app-root/src && \
chmod -R g=u /opt/app-root/src

COPY --chown=1001:0 --chmod=664 ./docling_serve ./docling_serve

EXPOSE 5001

CMD ["poetry", "run", "uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
CMD ["uvicorn", "--port", "5001", "--host", "0.0.0.0", "docling_serve.app:app"]
36 changes: 36 additions & 0 deletions models_download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import zipfile

import requests
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Download Docling models
StandardPdfPipeline.download_models_hf(force=True)
load_pretrained_nlp_models(verbose=True)

# Download EasyOCR models
urls = [
"https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip",
"https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip"
]

local_zip_paths = [
"/opt/app-root/src/latin_g2.zip",
"/opt/app-root/src/craft_mlt_25k.zip"
]

extract_path = "/opt/app-root/src/.EasyOCR/model/"

for url, local_zip_path in zip(urls, local_zip_paths):
# Download the file
response = requests.get(url)
with open(local_zip_path, "wb") as file:
file.write(response.content)

# Unzip the file
with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
zip_ref.extractall(extract_path)

# Clean up the zip file
os.remove(local_zip_path)
8 changes: 8 additions & 0 deletions os-packages.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
tesseract
tesseract-devel
tesseract-langpack-eng
leptonica-devel
libglvnd-glx
glib2
wget
git