Commit 9390c55

wip vertex ai

1 parent 5674947 commit 9390c55

8 files changed: +211 -76 lines

README.md

+28 -1

@@ -20,7 +20,7 @@ HF_MODEL_ID=hf-internal-testing/tiny-random-distilbert HF_MODEL_DIR=tmp2 HF_TASK
 ### Container
 
 
-1. build the preferred container for either CPU or GPU for PyTorch or TensorFlow.
+1. build the preferred container for either CPU or GPU for PyTorch.
 
 _cpu images_
 ```bash
@@ -58,6 +58,32 @@ curl --request POST \
 }'
 ```
 
+### Vertex AI Support
+
+The Hugging Face Inference Toolkit is also supported on Vertex AI, based on [Custom container requirements for prediction](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements). [Environment variables set by Vertex AI](https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements#aip-variables) are automatically detected and used by the toolkit.
+
+#### Local run with HF_MODEL_ID and HF_TASK
+
+Start the Hugging Face Inference Toolkit with the following environment variables.
+
+```bash
+mkdir tmp2/
+AIP_MODE=PREDICTION AIP_PORT=8080 AIP_PREDICT_ROUTE=/pred AIP_HEALTH_ROUTE=/h HF_MODEL_DIR=tmp2 HF_MODEL_ID=distilbert/distilbert-base-uncased-finetuned-sst-2-english HF_TASK=text-classification uvicorn src.huggingface_inference_toolkit.webservice_starlette:app --port 8080
+```
+
+Send a request. The API schema is the same as for the [inference API](https://huggingface.co/docs/api-inference/detailed_parameters).
+
+```bash
+curl --request POST \
+  --url http://localhost:8080/pred \
+  --header 'Content-Type: application/json' \
+  --data '{
+    "instances": ["I love this product", "I hate this product"],
+    "parameters": { "top_k": 2 }
+}'
+```
+
 
 ---
 
@@ -176,6 +202,7 @@ Below you'll find a list of supported and tested transformers and sentence trans
 ## ⚙ Supported Frontend
 
 - [x] Starlette (HF Endpoints)
+- [ ] Starlette (Vertex AI)
 - [ ] Starlette (Azure ML)
 - [ ] Starlette (SageMaker)
 
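For reference, the same Vertex AI style request from Python instead of curl — a minimal sketch that assumes the local server from the README snippet above is listening on port 8080, and uses the external `requests` package, which is not a dependency of this repository:

```python
import requests

# Vertex AI schema: one prediction per entry in "instances";
# "parameters" are shared across all instances.
response = requests.post(
    "http://localhost:8080/pred",
    json={
        "instances": ["I love this product", "I hate this product"],
        "parameters": {"top_k": 2},
    },
)
response.raise_for_status()
# the toolkit wraps the per-instance outputs in a "predictions" list
print(response.json()["predictions"])
```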

+48

@@ -0,0 +1,48 @@
+ARG BASE_IMAGE=nvidia/cuda:12.1.0-devel-ubuntu22.04
+
+FROM $BASE_IMAGE
+SHELL ["/bin/bash", "-c"]
+
+LABEL maintainer="Hugging Face"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install software-properties-common -y && \
+    add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get -y upgrade --only-upgrade systemd openssl cryptsetup && \
+    apt-get install -y \
+    build-essential \
+    bzip2 \
+    curl \
+    git \
+    git-lfs \
+    tar \
+    gcc \
+    g++ \
+    cmake \
+    libprotobuf-dev \
+    protobuf-compiler \
+    python3-dev \
+    python3-pip \
+    python3.11 \
+    libsndfile1-dev \
+    ffmpeg \
+    && apt-get clean autoremove --yes \
+    && rm -rf /var/lib/{apt,dpkg,cache,log}
+# Copying only necessary files as filtered by .dockerignore
+COPY . .
+
+# upgrade pip and install the toolkit with its torch, st, and diffusers extras
+RUN pip install --no-cache-dir -U pip ".[torch, st, diffusers]"
+
+# copy application
+COPY src/huggingface_inference_toolkit huggingface_inference_toolkit
+COPY src/huggingface_inference_toolkit/webservice_starlette.py webservice_starlette.py
+
+# copy entrypoint and change permissions
+COPY --chmod=0755 scripts/entrypoint.sh entrypoint.sh
+
+ENTRYPOINT ["bash", "-c", "./entrypoint.sh"]

requirements.txt

Whitespace-only changes.

scripts/entrypoint.sh

+13 -5

@@ -1,13 +1,21 @@
-# /bin/bash
+#!/bin/bash
 
-# check if HF_MODEL_DIR is set and if not skip installing custom dependencies
+# Define the default port
+PORT=5000
+
+# Check if AIP_MODE is set and adjust the port for Vertex AI
+if [[ ! -z "${AIP_MODE}" ]]; then
+    PORT=${AIP_HTTP_PORT}
+fi
+
+# Check if HF_MODEL_DIR is set and if not skip installing custom dependencies
 if [[ ! -z "${HF_MODEL_DIR}" ]]; then
-    # check if requirements.txt exists and if so install dependencies
+    # Check if requirements.txt exists and if so install dependencies
     if [ -f "${HF_MODEL_DIR}/requirements.txt" ]; then
         echo "Installing custom dependencies from ${HF_MODEL_DIR}/requirements.txt"
         pip install -r ${HF_MODEL_DIR}/requirements.txt --no-cache-dir;
     fi
 fi
 
-# start the server
-uvicorn webservice_starlette:app --host 0.0.0.0 --port 5000
+# Start the server
+uvicorn webservice_starlette:app --host 0.0.0.0 --port ${PORT}

src/huggingface_inference_toolkit/handler.py

+39 -1

@@ -1,4 +1,5 @@
 import logging
+import os
 from pathlib import Path
 from typing import Optional, Union
 
@@ -40,15 +41,52 @@ def __call__(self, data):
         return prediction
 
 
+class VertexAIHandler(HuggingFaceHandler):
+    """
+    A default Vertex AI Hugging Face Inference Handler which abstracts the
+    Vertex AI specific logic for inference.
+    """
+    def __init__(self, model_dir: Union[str, Path], task=None, framework="pt"):
+        super().__init__(model_dir, task, framework)
+
+    def __call__(self, data):
+        """
+        Handles an inference request with input data and makes a prediction.
+        Args:
+            :data: (obj): the raw request body data.
+        :return: prediction output
+        """
+        if "instances" not in data:
+            raise ValueError("The request body must contain a key 'instances' with a list of instances.")
+        parameters = data.pop("parameters", None)
+
+        predictions = []
+        # iterate over all instances and make predictions
+        for inputs in data["instances"]:
+            payload = {"inputs": inputs, "parameters": parameters}
+            predictions.append(super().__call__(payload))
+
+        # return predictions
+        return {"predictions": predictions}
+
 def get_inference_handler_either_custom_or_default_handler(
     model_dir: Path,
     task: Optional[str] = None
 ):
     """
-    get inference handler either custom or default Handler
+    Returns the appropriate inference handler based on the given model directory and task.
+
+    Args:
+        model_dir (Path): The directory path where the model is stored.
+        task (Optional[str]): The task for which the inference handler is required. Defaults to None.
+
+    Returns:
+        InferenceHandler: The appropriate inference handler based on the given model directory and task.
     """
     custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
     if custom_pipeline:
         return custom_pipeline
+    elif os.environ.get("AIP_MODE", None) == "PREDICTION":
+        return VertexAIHandler(model_dir=model_dir, task=task)
     else:
         return HuggingFaceHandler(model_dir=model_dir, task=task)
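To make the new request flow concrete, here is a hypothetical direct invocation of `VertexAIHandler` — a sketch that assumes a model has already been downloaded to `tmp2/` (for example via the README snippet above); the directory and task are example values, not toolkit defaults:

```python
from huggingface_inference_toolkit.handler import VertexAIHandler

# example model directory and task, assumed to exist locally
handler = VertexAIHandler(model_dir="tmp2", task="text-classification")

# each entry in "instances" becomes one {"inputs": ..., "parameters": ...}
# payload for the underlying HuggingFaceHandler
body = {
    "instances": ["I love this product", "I hate this product"],
    "parameters": {"top_k": 2},
}
result = handler(body)
print(result["predictions"])  # one prediction per instance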
src/huggingface_inference_toolkit/vertex_ai_utils.py

+46

@@ -0,0 +1,46 @@
+import logging
+from pathlib import Path
+import re
+from typing import Union
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(format="%(asctime)s | %(levelname)s | %(message)s", level=logging.INFO)
+
+from google.cloud import storage
+
+_logger = logging.getLogger(__name__)
+
+
+GCS_URI_PREFIX = "gs://"
+
+
+# copied from https://github.com/googleapis/python-aiplatform/blob/94d838d8cfe1599bc2d706e66080c05108821986/google/cloud/aiplatform/utils/prediction_utils.py#L121
+def _load_repository_from_gcs(artifact_uri: str, target_dir: Union[str, Path] = "/tmp"):
+    """
+    Load files from a GCS path into target_dir.
+    """
+    _logger.info(f"Loading model artifacts from {artifact_uri} to {target_dir}")
+    target_dir = Path(target_dir)
+
+    if artifact_uri.startswith(GCS_URI_PREFIX):
+        matches = re.match(f"{GCS_URI_PREFIX}(.*?)/(.*)", artifact_uri)
+        bucket_name, prefix = matches.groups()
+
+        gcs_client = storage.Client()
+        blobs = gcs_client.list_blobs(bucket_name, prefix=prefix)
+        for blob in blobs:
+            name_without_prefix = blob.name[len(prefix) :]
+            name_without_prefix = (
+                name_without_prefix[1:]
+                if name_without_prefix.startswith("/")
+                else name_without_prefix
+            )
+            file_split = name_without_prefix.split("/")
+            # Path.joinpath builds the destination directory (Path has no list-taking .join)
+            directory = target_dir.joinpath(*file_split[0:-1])
+            directory.mkdir(parents=True, exist_ok=True)
+            if name_without_prefix and not name_without_prefix.endswith("/"):
+                # download into target_dir rather than the current working directory
+                blob.download_to_filename(str(target_dir / name_without_prefix))
+
+    return str(target_dir.absolute())
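Usage of the helper is straightforward; a sketch with a placeholder bucket URI (authenticated `google-cloud-storage` credentials are assumed):

```python
from huggingface_inference_toolkit.vertex_ai_utils import _load_repository_from_gcs

# gs://my-example-bucket/... is a made-up placeholder, not a real bucket
model_dir = _load_repository_from_gcs(
    "gs://my-example-bucket/models/distilbert-sst2",
    target_dir="/tmp/model",
)
print(model_dir)  # absolute path containing the downloaded artifacts
```

On Vertex AI the URI comes from the `AIP_STORAGE_URI` environment variable, as wired up in webservice_starlette.py below.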

src/huggingface_inference_toolkit/webservice_robyn.py

-57
This file was deleted.

src/huggingface_inference_toolkit/webservice_starlette.py

+37 -12

@@ -1,4 +1,5 @@
 import logging
+import os
 from pathlib import Path
 from time import perf_counter
 
@@ -20,6 +21,7 @@
 from huggingface_inference_toolkit.serialization.base import ContentType
 from huggingface_inference_toolkit.serialization.json_utils import Jsoner
 from huggingface_inference_toolkit.utils import _load_repository_from_hf, convert_params_to_int_or_bool
+from huggingface_inference_toolkit.vertex_ai_utils import _load_repository_from_gcs
 
 
 def config_logging(level=logging.INFO):
@@ -35,10 +37,11 @@ def config_logging(level=logging.INFO):
 logger = logging.getLogger(__name__)
 
 
-async def some_startup_task():
+async def prepare_model_artifacts():
     global inference_handler
     # 1. check if model artifacts available in HF_MODEL_DIR
     if len(list(Path(HF_MODEL_DIR).glob("**/*"))) <= 0:
+        # 2. if not available, try to load from HF_MODEL_ID
         if HF_MODEL_ID is not None:
             _load_repository_from_hf(
                 repository_id=HF_MODEL_ID,
@@ -47,6 +50,11 @@ async def some_startup_task():
                 revision=HF_REVISION,
                 hf_hub_token=HF_HUB_TOKEN,
             )
+        # 3. check if in Vertex AI environment and load from GCS
+        # if artifactUri was not set at model creation, AIP_STORAGE_URI is an empty string
+        elif len(os.environ.get("AIP_STORAGE_URI", "")) > 0:
+            _load_repository_from_gcs(os.environ["AIP_STORAGE_URI"], target_dir=HF_MODEL_DIR)
+        # 4. if not available, raise error
         else:
             raise ValueError(
                 f"""Can't initialize model.
@@ -72,7 +80,7 @@ async def predict(request):
     # try to deserialize payload
     deserialized_body = ContentType.get_deserializer(content_type).deserialize(await request.body())
     # checks if input schema is correct
-    if "inputs" not in deserialized_body:
+    if "inputs" not in deserialized_body and "instances" not in deserialized_body:
         raise ValueError(f"Body needs to provide an inputs key, received: {orjson.dumps(deserialized_body)}")
 
     # check for query parameter and add them to the body
@@ -97,14 +105,31 @@ async def predict(request):
         logger.error(e)
         return Response(Jsoner.serialize({"error": str(e)}), status_code=400, media_type="application/json")
 
-
-app = Starlette(
-    debug=True,
-    routes=[
-        Route("/", health, methods=["GET"]),
-        Route("/health", health, methods=["GET"]),
-        Route("/", predict, methods=["POST"]),
-        Route("/predict", predict, methods=["POST"]),
-    ],
-    on_startup=[some_startup_task],
+# Create app based on which cloud environment is used
+if os.getenv("AIP_MODE", None) == "PREDICTION":
+    logger.info("Running in Vertex AI environment")
+    # extract routes from environment variables
+    _predict_route = os.getenv("AIP_PREDICT_ROUTE", None)
+    _health_route = os.getenv("AIP_HEALTH_ROUTE", None)
+    if _predict_route is None or _health_route is None:
+        raise ValueError("AIP_PREDICT_ROUTE and AIP_HEALTH_ROUTE need to be set in Vertex AI environment")
+
+    app = Starlette(
+        debug=False,
+        routes=[
+            Route(_health_route, health, methods=["GET"]),
+            Route(_predict_route, predict, methods=["POST"]),
+        ],
+        on_startup=[prepare_model_artifacts],
+    )
+else:
+    app = Starlette(
+        debug=False,
+        routes=[
+            Route("/", health, methods=["GET"]),
+            Route("/health", health, methods=["GET"]),
+            Route("/", predict, methods=["POST"]),
+            Route("/predict", predict, methods=["POST"]),
+        ],
+        on_startup=[prepare_model_artifacts],
 )
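A rough way to exercise the Vertex AI branch locally is Starlette's TestClient. This is a sketch only: it assumes the package is importable as `huggingface_inference_toolkit`, that the `AIP_*` variables are exported before the module import (the app is constructed at import time), and that `httpx` is installed for the test client; the startup hook downloads the model, so the first run is slow:

```python
import os

# must be set before the import below, since the app is built at import time
os.environ["AIP_MODE"] = "PREDICTION"
os.environ["AIP_PREDICT_ROUTE"] = "/pred"
os.environ["AIP_HEALTH_ROUTE"] = "/h"
os.environ["HF_MODEL_DIR"] = "tmp2"  # example values, as in the README
os.environ["HF_MODEL_ID"] = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
os.environ["HF_TASK"] = "text-classification"

from starlette.testclient import TestClient
from huggingface_inference_toolkit.webservice_starlette import app

# entering the context manager runs the prepare_model_artifacts startup hook
with TestClient(app) as client:
    assert client.get("/h").status_code == 200
    response = client.post("/pred", json={"instances": ["I love this product"]})
    print(response.json())
```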
