huggingface · alvarobartt · Aug 14, 2025 · Aug 14, 2025 · Aug 14, 2025 · Aug 14, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -110,14 +110,5 @@ FROM base AS http
 
 COPY --from=http-builder /usr/src/target/release/text-embeddings-router /usr/local/bin/text-embeddings-router
 
-# Amazon SageMaker compatible image
-FROM http AS sagemaker
-COPY --chmod=775 sagemaker-entrypoint.sh entrypoint.sh
-
-ENTRYPOINT ["./entrypoint.sh"]
-
-# Default image
-FROM http
-
 ENTRYPOINT ["text-embeddings-router"]
 CMD ["--json-output"]
diff --git a/Dockerfile-cuda-all b/Dockerfile-cuda-all
@@ -35,7 +35,6 @@ FROM base-builder AS builder
 
 ARG GIT_SHA
 ARG DOCKER_LABEL
-ARG VERTEX="false"
 
 # sccache specific variables
 ARG SCCACHE_GHA_ENABLED
@@ -51,39 +50,19 @@ COPY --from=planner /usr/src/recipe.json recipe.json
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-      cargo chef cook --release --features google --recipe-path recipe.json && sccache -s; \
-    else \
-      cargo chef cook --release --recipe-path recipe.json && sccache -s; \
-    fi;
+    cargo chef cook --release --recipe-path recipe.json && sccache -s;
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-      CUDA_COMPUTE_CAP=75 cargo chef cook --release --features google --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \
-    else \
-      CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --recipe-path recipe.json && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=75 cargo chef cook --release --features candle-cuda-turing --recipe-path recipe.json && sccache -s;
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-      CUDA_COMPUTE_CAP=80 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    else \
-      CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=80 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-      CUDA_COMPUTE_CAP=90 cargo chef cook --release --features google --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    else \
-      CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=90 cargo chef cook --release --features candle-cuda --recipe-path recipe.json && sccache -s;
 
 COPY backends backends
 COPY core core
@@ -93,34 +72,19 @@ COPY Cargo.lock ./
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-        CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing -F google  && sccache -s; \
-    else \
-        CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=75 cargo build --release --bin text-embeddings-router -F candle-cuda-turing && sccache -s;
 
 RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-75
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-        CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda -F google  && sccache -s; \
-    else \
-        CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=80 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
 
 RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-80
 
 RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
-    if [ $VERTEX = "true" ]; \
-    then \
-        CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda -F google  && sccache -s; \
-    else \
-        CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s; \
-    fi;
+    CUDA_COMPUTE_CAP=90 cargo build --release --bin text-embeddings-router -F candle-cuda && sccache -s;
 
 RUN mv /usr/src/target/release/text-embeddings-router /usr/src/target/release/text-embeddings-router-90
 
@@ -142,16 +106,6 @@ COPY --from=builder /usr/src/target/release/text-embeddings-router-75 /usr/local
 COPY --from=builder /usr/src/target/release/text-embeddings-router-80 /usr/local/bin/text-embeddings-router-80
 COPY --from=builder /usr/src/target/release/text-embeddings-router-90 /usr/local/bin/text-embeddings-router-90
 
-# Amazon SageMaker compatible image
-FROM base AS sagemaker
-
-COPY --chmod=775 sagemaker-entrypoint-cuda-all.sh entrypoint.sh
-
-ENTRYPOINT ["./entrypoint.sh"]
-
-# Default image
-FROM base
-
 COPY --chmod=775 cuda-all-entrypoint.sh entrypoint.sh
 
 ENTRYPOINT ["./entrypoint.sh"]

diff --git a/cuda-all-entrypoint.sh b/cuda-all-entrypoint.sh
@@ -1,21 +1,19 @@
 #!/bin/bash
 
-if ! command -v nvidia-smi &> /dev/null; then
+if ! command -v nvidia-smi &>/dev/null; then
     echo "Error: 'nvidia-smi' command not found."
     exit 1
 fi
 
 compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv | sed -n '2p' | sed 's/\.//g')
 
-if [ ${compute_cap} -eq 75 ]
-then
+if [ "${compute_cap}" -eq 75 ]; then
     exec text-embeddings-router-75 "$@"
-elif [ ${compute_cap} -ge 80 -a ${compute_cap} -lt 90 ]
-then
+elif [ "${compute_cap}" -ge 80 ] && [ "${compute_cap}" -lt 90 ]; then
     exec text-embeddings-router-80 "$@"
-elif [ ${compute_cap} -eq 90 ]
-then
+elif [ "${compute_cap}" -eq 90 ]; then
     exec text-embeddings-router-90 "$@"
 else
-    echo "cuda compute cap ${compute_cap} is not supported"; exit 1
+    echo "cuda compute cap ${compute_cap} is not supported"
+    exit 1
 fi
diff --git a/sagemaker-entrypoint-cuda-all.sh b/sagemaker-entrypoint-cuda-all.sh
@@ -1,14 +1,21 @@
 #!/bin/bash
 
+if ! command -v nvidia-smi &>/dev/null; then
+    echo "Error: 'nvidia-smi' command not found."
+    exit 1
+fi
+
+# verlte A B: succeeds only when version B sorts strictly before version A (fails when equal)
 verlte() {
     [ "$1" = "$2" ] && return 1 || [ "$2" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]
 }
 
+# CUDA compat libs logic
 if [ -f /usr/local/cuda/compat/libcuda.so.1 ]; then
     CUDA_COMPAT_MAX_DRIVER_VERSION=$(readlink /usr/local/cuda/compat/libcuda.so.1 | cut -d"." -f 3-)
     echo "CUDA compat package requires Nvidia driver ≤${CUDA_COMPAT_MAX_DRIVER_VERSION}"
     cat /proc/driver/nvidia/version
-    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
+    NVIDIA_DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true) # " *" absorbs the variable run of spaces before the version
     echo "Current installed Nvidia driver version is ${NVIDIA_DRIVER_VERSION}"
     if [ $(verlte "$CUDA_COMPAT_MAX_DRIVER_VERSION" "$NVIDIA_DRIVER_VERSION") ]; then
         echo "Setup CUDA compatibility libs path to LD_LIBRARY_PATH"
@@ -21,71 +28,27 @@ else
     echo "Skip CUDA compat libs setup as package not found"
 fi
 
+# Model variables check
 if [[ -z "${HF_MODEL_ID}" ]]; then
-  echo "HF_MODEL_ID must be set"
-  exit 1
+    echo "HF_MODEL_ID must be set"
+    exit 1
 fi
 export MODEL_ID="${HF_MODEL_ID}"
 
 if [[ -n "${HF_MODEL_REVISION}" ]]; then
-  export REVISION="${HF_MODEL_REVISION}"
-fi
-
-if ! command -v nvidia-smi &> /dev/null; then
-    echo "Error: 'nvidia-smi' command not found."
-    exit 1
-fi
-
-# Query GPU name using nvidia-smi
-gpu_name=$(nvidia-smi --query-gpu=gpu_name --format=csv | awk 'NR==2')
-if [ $? -ne 0 ]; then
-    echo "Error: $gpu_name"
-    echo "Query gpu_name failed"
-else
-    echo "Query gpu_name succeeded. Printing output: $gpu_name"
+    export REVISION="${HF_MODEL_REVISION}"
 fi
 
-# Function to get compute capability based on GPU name
-get_compute_cap() {
-    gpu_name="$1"
-
-    # Check if the GPU name contains "A10G"
-    if [[ "$gpu_name" == *"A10G"* ]]; then
-        echo "86"
-    # Check if the GPU name contains "A100"
-    elif [[ "$gpu_name" == *"A100"* ]]; then
-        echo "80"
-    # Check if the GPU name contains "H100"
-    elif [[ "$gpu_name" == *"H100"* ]]; then
-        echo "90"
-    # Cover Nvidia T4
-    elif [[ "$gpu_name" == *"T4"* ]]; then
-        echo "75"
-    # Cover Nvidia L4
-    elif [[ "$gpu_name" == *"L4"* ]]; then
-        echo "89"
-    else
-        echo "80"  # Default compute capability
-    fi
-}
-
-if [[ -z "${CUDA_COMPUTE_CAP}" ]]
-then
-    compute_cap=$(get_compute_cap "$gpu_name")
-    echo "the compute_cap is $compute_cap"
-else
-    compute_cap=$CUDA_COMPUTE_CAP
-fi
+compute_cap=$(nvidia-smi --query-gpu=compute_cap --format=csv | sed -n '2p' | sed 's/\.//g')
 
-if [[ ${compute_cap} -eq 75 ]]
-then
-    text-embeddings-router-75 --port 8080 --json-output
-elif [[ ${compute_cap} -ge 80 && ${compute_cap} -lt 90 ]]
-then
-    text-embeddings-router-80 --port 8080 --json-output
-elif [[ ${compute_cap} -eq 90 ]]
-then
-    text-embeddings-router-90 --port 8080 --json-output
+# Router selection logic: exec the binary built for the detected compute capability
+if [ "${compute_cap}" -eq 75 ]; then
+    exec text-embeddings-router-75 --port 8080 --json-output
+elif [ "${compute_cap}" -ge 80 ] && [ "${compute_cap}" -lt 90 ]; then
+    exec text-embeddings-router-80 --port 8080 --json-output
+elif [ "${compute_cap}" -eq 90 ]; then
+    exec text-embeddings-router-90 --port 8080 --json-output
 else
-    echo "cuda compute cap ${compute_cap} is not supported"; exit 1
+    echo "cuda compute cap ${compute_cap} is not supported"
+    exit 1
 fi
diff --git a/sagemaker-entrypoint.sh b/sagemaker-entrypoint.sh
@@ -1,13 +1,13 @@
 #!/bin/bash
 
 if [[ -z "${HF_MODEL_ID}" ]]; then
-  echo "HF_MODEL_ID must be set"
-  exit 1
+    echo "HF_MODEL_ID must be set"
+    exit 1
 fi
 export MODEL_ID="${HF_MODEL_ID}"
 
 if [[ -n "${HF_MODEL_REVISION}" ]]; then
-  export REVISION="${HF_MODEL_REVISION}"
+    export REVISION="${HF_MODEL_REVISION}"
 fi
 
-text-embeddings-router --port 8080 --json-output
+exec text-embeddings-router --port 8080 --json-output