73 changes: 73 additions & 0 deletions Dockerfile
@@ -0,0 +1,73 @@
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

# Install conda
# copy-pasted from https://github.com/anaconda/docker-images/blob/main/anaconda3/debian/Dockerfile
ENV LANG=C.UTF-8 LC_ALL=C.UTF-8
ENV PATH=/opt/conda/bin:$PATH

# renovate: datasource=custom.anaconda_installer
ARG INSTALLER_URL_LINUX64="https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh"
ARG SHA256SUM_LINUX64="3ba0a298155c32fbfd80cbc238298560bf69a2df511783054adfc151b76d80d8"
# renovate: datasource=custom.anaconda_installer
ARG INSTALLER_URL_S390X="https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-s390x.sh"
ARG SHA256SUM_S390X="e00bd5e6c275695e8050a45aa85790315f504c95243dfe3632f505284310f3c4"
# renovate: datasource=custom.anaconda_installer
ARG INSTALLER_URL_AARCH64="https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-aarch64.sh"
ARG SHA256SUM_AARCH64="489c608e8bddd2cf29dfbdd811cf99087cd6b6a0615d41c6f9058ce340594b65"

# hadolint ignore=DL3008
RUN set -x && \
    apt-get update --fix-missing && \
    apt-get install -y --no-install-recommends \
        bzip2 \
        ca-certificates \
        git \
        libglib2.0-0 \
        libsm6 \
        libxcomposite1 \
        libxcursor1 \
        libxdamage1 \
        libxext6 \
        libxfixes3 \
        libxi6 \
        libxinerama1 \
        libxrandr2 \
        libxrender1 \
        mercurial \
        openssh-client \
        procps \
        subversion \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* && \
    UNAME_M="$(uname -m)" && \
    if [ "${UNAME_M}" = "x86_64" ]; then \
        INSTALLER_URL=${INSTALLER_URL_LINUX64}; \
        SHA256SUM=${SHA256SUM_LINUX64}; \
    elif [ "${UNAME_M}" = "s390x" ]; then \
        INSTALLER_URL=${INSTALLER_URL_S390X}; \
        SHA256SUM=${SHA256SUM_S390X}; \
    elif [ "${UNAME_M}" = "aarch64" ]; then \
        INSTALLER_URL=${INSTALLER_URL_AARCH64}; \
        SHA256SUM=${SHA256SUM_AARCH64}; \
    else \
        echo "Unsupported architecture: ${UNAME_M}" >&2; \
        exit 1; \
    fi && \
    wget "${INSTALLER_URL}" -O anaconda.sh -q && \
    echo "${SHA256SUM} anaconda.sh" > shasum && \
    sha256sum --check --status shasum && \
    /bin/bash anaconda.sh -b -p /opt/conda && \
    rm anaconda.sh shasum && \
    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate" >> ~/.bashrc && \
    find /opt/conda/ -follow -type f -name '*.a' -delete && \
    find /opt/conda/ -follow -type f -name '*.js.map' -delete && \
    /opt/conda/bin/conda clean -afy

# Install Meta perception_models
RUN git clone https://github.com/facebookresearch/perception_models.git \
&& cd perception_models \
&& conda create -y --name perception_models python=3.12 \
&& conda run --no-capture-output -n perception_models pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 xformers --index-url https://download.pytorch.org/whl/cu124 \
&& conda install -n perception_models -y ffmpeg -c conda-forge \
&& conda run --no-capture-output -n perception_models pip install torchcodec==0.1 --index-url=https://download.pytorch.org/whl/cu124 \
&& conda run --no-capture-output -n perception_models pip install -e .
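
The installer download in the Dockerfile combines two shell patterns: architecture dispatch via `uname -m` and checksum verification with `sha256sum --check`. A standalone sketch of both (the function name, paths, and sample file are illustrative, and a local file stands in for the installer so it runs without network access):

```shell
# Illustrative sketch of the Dockerfile's two download-time patterns.
# 1) Architecture dispatch: map `uname -m` output to an installer name.
select_installer() {
    case "$1" in
        x86_64)  echo "Anaconda3-2024.10-1-Linux-x86_64.sh" ;;
        s390x)   echo "Anaconda3-2024.10-1-Linux-s390x.sh" ;;
        aarch64) echo "Anaconda3-2024.10-1-Linux-aarch64.sh" ;;
        *)       echo "unsupported architecture: $1" >&2; return 1 ;;
    esac
}
select_installer x86_64   # → Anaconda3-2024.10-1-Linux-x86_64.sh

# 2) Checksum verification: record the expected digest, then let
#    `sha256sum --check` compare it against the file on disk.
printf 'pretend this is an installer\n' > /tmp/installer.sh
sha256sum /tmp/installer.sh > /tmp/installer.sh.sha256
sha256sum --check --status /tmp/installer.sh.sha256 && echo "checksum OK"
```

Writing the digest to a file and checking with `--status` (exit code only) keeps the `RUN` step quiet on success and fails the build on any mismatch.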
32 changes: 32 additions & 0 deletions README.md
@@ -52,6 +52,38 @@ pip install -e .
This will install an editable version of the repo, allowing you to make changes to the code without needing to reinstall the package every time.


## Run as Docker Container

### Build Docker Container:
```bash
docker build -t facebookresearch/perception_models .
```

### Run Example:

To run the model using Docker, first copy your video to `/tmp/demo.mp4`. Replace `HF_TOKEN` with your Hugging Face token, and adjust `MODEL`, `QUESTION`, and `MEDIA` to your needs:

```bash
HF_TOKEN=hf_REPLACE_ME
QUESTION='What is happening in the video?'
MEDIA='/tmp/demo.mp4'
MODEL='facebook/Perception-LM-1B'

docker run -it --gpus all \
  -v /tmp:/tmp \
  -e HF_TOKEN="${HF_TOKEN}" \
  facebookresearch/perception_models \
  conda run -n perception_models --no-capture-output \
  python3 /perception_models/apps/plm/generate.py \
    --ckpt "${MODEL}" \
    --media_type video \
    --question "${QUESTION}" \
    --media_path "${MEDIA}"
```

This command runs the Docker container with all available NVIDIA GPUs attached (`--gpus all`).
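
If you run this often, the invocation can be wrapped in a small helper script; the wrapper and its `DRY_RUN` toggle are illustrative, not part of the repo. With `DRY_RUN=1` (the default here) it only prints the assembled command, which is handy for checking arguments on a machine without Docker or a GPU; set `DRY_RUN=0` to execute it:

```shell
# Illustrative wrapper around the `docker run` command above.
HF_TOKEN="${HF_TOKEN:-hf_REPLACE_ME}"
QUESTION="${QUESTION:-What is happening in the video?}"
MEDIA="${MEDIA:-/tmp/demo.mp4}"
MODEL="${MODEL:-facebook/Perception-LM-1B}"

CMD="docker run -it --gpus all \
  -v /tmp:/tmp \
  -e HF_TOKEN=${HF_TOKEN} \
  facebookresearch/perception_models \
  conda run -n perception_models --no-capture-output \
  python3 /perception_models/apps/plm/generate.py \
    --ckpt ${MODEL} --media_type video \
    --question \"${QUESTION}\" --media_path ${MEDIA}"

if [ "${DRY_RUN:-1}" = "1" ]; then
    echo "$CMD"       # dry run: show the command only
else
    eval "$CMD"       # actually launch the container
fi
```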


## 🙏 Acknowledgement
We are thankful to [Meta Lingua](https://github.com/facebookresearch/lingua) for releasing their code as open-source contributions. The code structure and code implementation of the LLM is directly forked from [Meta Lingua](https://github.com/facebookresearch/lingua). We are also thankful to [Open_CLIP](https://github.com/mlfoundations/open_clip) for open-source contributions in CLIP training, and [CLIP_benchmark](https://github.com/LAION-AI/CLIP_benchmark) for CLIP model evaluation.
