-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathDockerfile_GenModel
87 lines (68 loc) · 2.83 KB
/
Dockerfile_GenModel
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Build an image that can do inference in SageMaker.
# This is a Python image that uses the nginx, gunicorn, flask stack.
FROM public.ecr.aws/ubuntu/ubuntu:20.04_stable

# See http://bugs.python.org/issue19846 — C.UTF-8 avoids Python text I/O
# encoding issues. Use key=value form; legacy space-separated ENV is deprecated.
ENV LANG=C.UTF-8

ARG PYTHON=python3

# tzdata prompts for a timezone unless DEBIAN_FRONTEND is set, so install it
# non-interactively. Drop the apt lists in the SAME layer — deleting them in a
# later RUN would not shrink this layer.
RUN apt-get update && \
    DEBIAN_FRONTEND="noninteractive" TZ="America/New_York" apt-get install -y tzdata && \
    rm -rf /var/lib/apt/lists/*
# OS-level build and runtime dependencies (sorted for diffability).
# `apt-get clean` is folded into this layer: running it as a separate RUN
# (as the original did) cannot shrink an already-committed layer.
# NOTE(review): the bare "python" package has no installation candidate on
# Ubuntu 20.04 (focal) — confirm this build actually succeeds, or replace it
# with python-is-python3; ${PYTHON} (python3) is already installed below.
RUN apt-get -y update && apt-get install -y --no-install-recommends --fix-missing \
        build-essential \
        ca-certificates \
        curl \
        git \
        google-perftools \
        libjemalloc-dev \
        nginx \
        numactl \
        python \
        ${PYTHON} \
        ${PYTHON}-pip \
        wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# NOTE(review): PYTHON_VERSION is not referenced anywhere else in this file —
# confirm whether downstream tooling reads it before removing.
ENV PYTHON_VERSION=3.8

# Register an ipykernel for the interpreter. Invoke pip via ${PYTHON} -m pip
# so the package lands in the same interpreter that registers the kernel, and
# use --no-cache-dir so the pip wheel cache doesn't bloat the layer.
RUN ${PYTHON} -m pip install --no-cache-dir ipykernel && \
    ${PYTHON} -m ipykernel install --sys-prefix

# Core Python tooling and the SageMaker / HuggingFace stack.
# NOTE(review): these are unpinned — pin versions for reproducible builds.
RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \
    pip \
    psutil \
    setuptools \
    boto3 \
    sagemaker \
    transformers \
    datasets \
    awscli

# Some TF tools expect a "python" binary. -sf makes the link idempotent on
# cached/partial rebuilds instead of failing when the target already exists.
RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python
# Pinned CPU builds of PyTorch and Intel Extension for PyTorch (IPEX).
# IPEX and torch versions must match (both 1.11.0 here).
ARG IPEX_VERSION=1.11.0
ARG PYTORCH_VERSION=1.11.0+cpu
ARG TORCHAUDIO_VERSION=0.11.0
ARG TORCHVISION_VERSION=0.12.0+cpu
# -f index pages for the CPU-only torch wheels and the Intel IPEX wheels.
ARG TORCH_CPU_URL=https://download.pytorch.org/whl/cpu/torch_stable.html
ARG IPEX_URL=https://software.intel.com/ipex-whl-stable
# torch must be installed before intel_extension_for_pytorch (IPEX builds
# against a specific torch), hence the two chained pip invocations.
RUN ${PYTHON} -m pip install --no-cache-dir \
intel-openmp \
torch==${PYTORCH_VERSION} torchvision==${TORCHVISION_VERSION} torchaudio==${TORCHAUDIO_VERSION} -f ${TORCH_CPU_URL} && \
${PYTHON} -m pip install --no-cache-dir \
intel_extension_for_pytorch==${IPEX_VERSION} -f ${IPEX_URL}
# Point the libtcmalloc.so name at jemalloc so anything that dlopens/links
# tcmalloc actually gets jemalloc. NOTE(review): deliberate-looking trick —
# confirm nothing genuinely requires tcmalloc semantics.
RUN ln -sf /usr/lib/x86_64-linux-gnu/libjemalloc.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so
# oneDNN ISA cap for AMX-capable CPUs.
# NOTE(review): on hosts without AVX512/AMX support this cap is presumably
# ignored or downgraded by oneDNN — confirm against the oneDNN docs.
ENV DNNL_MAX_CPU_ISA="AVX512_CORE_AMX"
# Preload jemalloc + Intel OpenMP for allocator/threading performance.
# The libtcmalloc.so entry resolves to jemalloc via the symlink above.
# ${LD_PRELOAD} is not set earlier in this file, so the trailing reference
# expands to empty at build time (harmless trailing ':').
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libjemalloc.so:/usr/local/lib/libiomp5.so:/usr/lib/x86_64-linux-gnu/libtcmalloc.so":${LD_PRELOAD}
# Extra python packages to run the inference code (flask app served by
# gunicorn behind nginx). --no-cache-dir keeps pip's wheel cache out of the
# layer; the rm sweeps up any other caches written under /root.
RUN ${PYTHON} -m pip install --no-cache-dir flask gevent gunicorn && \
    rm -rf /root/.cache

# Set some environment variables. PYTHONUNBUFFERED keeps Python from buffering
# our standard output stream, which means that logs can be delivered to the
# user quickly. PYTHONDONTWRITEBYTECODE keeps Python from writing the .pyc
# files which are unnecessary in this case. We also update PATH so that the
# train and serve programs are found when the container is invoked.
ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ENV PATH="/opt/program:${PATH}"
ENV SM_MODEL_DIR="/opt/program/model"

# Set up the program in the image.
COPY bert_flask /opt/program
WORKDIR /opt/program
# Single layer for both permission fixes (the original used two RUNs):
# the program directory and the SageMaker `serve` entry point must be
# executable.
RUN chmod 755 /opt/program /opt/program/serve

# Quantize the model at build time, then package the INT8 and FP32 artifacts
# together with the tokenizer files so they ship inside the image.
# NOTE(review): no USER directive — the container runs as root. SageMaker
# serving containers commonly do, but confirm this is intended.
RUN python quantize_with_ds_ep.py
RUN tar -czf both_bert_model-test.tar.gz model_int8.pt model_fp32.pt tokenizer.json vocab.txt special_tokens_map.json tokenizer_config.json