Merge branch 'main' into auto_length

huggingface · Oct 25, 2024 · c3fb2ec · c3fb2ec
2 parents e3db525 + db68bd0
commit c3fb2ec
Show file tree

Hide file tree

Showing 82 changed files with 2,590 additions and 2,369 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Dockerfile.trtllm b/Dockerfile.trtllm
diff --git a/backends/trtllm/Dockerfile → Dockerfile_trtllm b/backends/trtllm/Dockerfile → Dockerfile_trtllm
@@ -10,7 +10,7 @@ COPY . .
 RUN cargo chef prepare --recipe-path recipe.json
 
 # CUDA dependent dependencies resolver stage
-FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder
+FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder
 
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \
@@ -26,6 +26,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     ninja-build \
     pkg-config \
     python3 \
+    python3-dev \
     python3-setuptools \
     tar \
     wget
@@ -82,10 +83,15 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$
     cd backends/trtllm && \
     CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release
 
-FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime
+FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime
+RUN apt update && apt install -y python3 && \
+    rm -rf /var/lib/apt/lists/*
+
 WORKDIR /usr/local/tgi/bin
 
 ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
+ENV TOKENIZERS_PARALLELISM=false
+ENV OMPI_MCA_plm_rsh_agent=""
 
 COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi
 COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt

diff --git a/README.md b/README.md
@@ -98,7 +98,7 @@ curl 127.0.0.1:8080/generate_stream \
 You can also use [TGI's Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) to obtain Open AI Chat Completion API compatible responses.
 
 ```bash
-curl localhost:3000/v1/chat/completions \
+curl localhost:8080/v1/chat/completions \
     -X POST \
     -d '{
   "model": "tgi",

diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt
@@ -1,5 +1,17 @@
 cmake_minimum_required(VERSION 3.20)
 
+if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug")
+    find_program(CCACHE_EXECUTABLE "ccache")
+    if (CCACHE_EXECUTABLE)
+        message(STATUS "Using ccache")
+        set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE)
+    endif ()
+endif ()
+
+if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
+    cmake_policy(SET CMP0135 NEW)
+endif ()
+
 project(tgi-trtllm-backend VERSION 1.0.0)
 set(CMAKE_CXX_STANDARD 20)
 
@@ -14,7 +26,7 @@ set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include"
 set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located")
 
 # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features
-find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
+find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml)
 
 #### External dependencies ####
 include(cmake/fmt.cmake)

diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml
@@ -10,16 +10,17 @@ async-trait = "0.1"
 async-stream = "0.3"
 clap = { version = "4.5", features = ["derive"] }
 cxx = "1.0"
+hashbrown = "0.14"
+hf-hub = { workspace = true }
 log = { version = "0.4", features = [] }
 text-generation-router = { path = "../../router" }
-tokenizers = { version = "0.19", features = ["hf-hub"] }
-tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
+tokenizers = { workspace = true }
+tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] }
 tokio-stream = "0.1.15"
-thiserror = "1.0.62"
+thiserror = "1.0.63"
 tracing = "0.1"
-tracing-opentelemetry = "0.24"
+tracing-opentelemetry = "0.25"
 tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] }
-parking_lot = "0.12"
 
 [build-dependencies]
 cmake = "0.1"

diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs
@@ -6,7 +6,7 @@ use std::path::{absolute, PathBuf};
 
 const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"];
 const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST");
-const CUDA_REQUIRED_VERSION: &str = "12.5";
+const CUDA_REQUIRED_VERSION: &str = "12.6";
 const MPI_REQUIRED_VERSION: &str = "4.1";
 const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX");
 const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR");
@@ -36,7 +36,7 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
     // Build the backend implementation through CMake
     let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi");
     let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt");
-    let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("90-real"); // Hopper by default
+    let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real");
 
     let mut install_path = PathBuf::from(install_path);
     if !install_path.is_absolute() {
@@ -81,7 +81,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf
     (PathBuf::from(install_path), deps_folder)
 }
 
-fn build_ffi_layer(deps_folder: &PathBuf) {
+fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) {
+    let ndebug = match is_debug {
+        true => "1",
+        false => "0",
+    };
+
     CFG.include_prefix = "backends/trtllm";
     cxx_build::bridge("src/lib.rs")
         .static_flag(true)
@@ -93,9 +98,14 @@ fn build_ffi_layer(deps_folder: &PathBuf) {
         .include("/usr/local/tensorrt/include")
         .file("src/ffi.cpp")
         .std("c++20")
+        .define("NDEBUG", ndebug)
         .compile("tgi_trtllm_backend");
 
     println!("cargo:rerun-if-changed=CMakeLists.txt");
+    println!("cargo:rerun-if-changed=cmake/trtllm.cmake");
+    println!("cargo:rerun-if-changed=cmake/json.cmake");
+    println!("cargo:rerun-if-changed=cmake/fmt.cmake");
+    println!("cargo:rerun-if-changed=cmake/spdlog.cmake");
     println!("cargo:rerun-if-changed=include/backend.h");
     println!("cargo:rerun-if-changed=lib/backend.cpp");
     println!("cargo:rerun-if-changed=include/ffi.h");
@@ -115,7 +125,7 @@ fn main() {
     let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir);
 
     // Build the FFI layer calling the backend above
-    build_ffi_layer(&deps_folder);
+    build_ffi_layer(&deps_folder, is_debug);
 
     // Emit linkage search path
     probe!("ompi", MPI_REQUIRED_VERSION);

diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake
@@ -1,6 +1,6 @@
 FetchContent_Declare(
         fmt
-        GIT_REPOSITORY https://github.com/fmtlib/fmt
-        GIT_TAG 11.0.1
+        DOWNLOAD_EXTRACT_TIMESTAMP
+        URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz
 )
 FetchContent_MakeAvailable(fmt)
diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake
@@ -1,5 +1,6 @@
 fetchcontent_declare(
         json
+        DOWNLOAD_EXTRACT_TIMESTAMP
         URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz
 )
 fetchcontent_makeavailable(json)
diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake
@@ -11,7 +11,7 @@ endif ()
 
 fetchcontent_declare(
         spdlog
-        GIT_REPOSITORY https://github.com/gabime/spdlog.git
-        GIT_TAG v1.14.1
+        DOWNLOAD_EXTRACT_TIMESTAMP
+        URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz
 )
 fetchcontent_makeavailable(spdlog)
diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake
@@ -23,8 +23,9 @@ endif ()
 fetchcontent_declare(
         trtllm
         GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git
-        GIT_TAG a681853d3803ee5893307e812530b5e7004bb6e1
+        GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc
         GIT_SHALLOW FALSE
+        DOWNLOAD_EXTRACT_TIMESTAMP
 )
 fetchcontent_makeavailable(trtllm)