Commit cfd4fbb

angt and mfuntowicz authored
[Backend] Add Llamacpp backend (#2975)
* Add llamacpp backend
* Get rid of llama_batch_get_one()
* Use max_batch_total_tokens
* Handle max_batch_size
* Add some input validation checks
* Handle ctx args & fix sampling
* Add GPU args
* Add --defrag-threshold
* Add a stupid batch mechanism
* Cleanup
* Add --numa
* Fix args
* Enable flash attention by default
* Add --offload-kqv
* Fix batch_pos
* backend(llama): add CUDA Dockerfile_llamacpp for now
* Only export the latest logits
* Output real logprobs
* Fix batching
* Fix seq iterations
* Auto-detect n_threads when not provided
* Clear request cache after completion
* Remove warmup
* Cleanup
* backend(llama): add CUDA architectures build argument for Dockerfile
* Add specific args for batch
* Add --type-v & --type-k
* Bump llamacpp to b4623
* Disable graceful shutdown in debug mode
* Update Dockerfile_llamacpp
* Cleanup Dockerfile
* Update Cargo.lock
* Update args
* Simplify batching logic
* Set TGI_LLAMA_PKG_CUDA from CUDA_VERSION
* Rename bindings
* Remove n_ctx
* Make max_batch_total_tokens optional
* Ensure all samplers are freed on error
* Initialize penalty_last_n with llamacpp default value
* Cleanup
* Improve default settings
* Add doc
* Update docs
* Thanks clippy
* Thanks cargo fmt
* Update docs
* Do not use HOSTNAME env
* Bump llama.cpp & cuda
* Fix requirements.txt
* Fix fmt
* Enable KQV offload by default
* Remove Ngrok tunneling
* Remove .cargo/config.toml
* Fix Dockerfile
* Add missing cuda prefix
* Handle custom llama.cpp dir
* Cleanup
* Add README.md
* Add HF transfer
* Fix bool args
* Update doc
* Update doc

---------

Signed-off-by: Adrien Gallouët <[email protected]>
Co-authored-by: Morgan Funtowicz <[email protected]>
1 parent 6df0fc0 commit cfd4fbb

File tree

12 files changed: +1764 -421 lines

Cargo.lock

Lines changed: 504 additions & 421 deletions
Generated file; diff not rendered by default.

Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ members = [
     "backends/v3",
     "backends/grpc-metadata",
     "backends/trtllm",
+    "backends/llamacpp",
     "launcher",
     "router"
 ]

Dockerfile_llamacpp

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps

ARG llamacpp_version=b4651
ARG llamacpp_cuda=OFF
ARG cuda_arch=75-real;80-real;86-real;89-real;90-real

WORKDIR /opt/src

ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt install -y \
    clang \
    cmake \
    curl \
    git \
    python3-dev \
    libssl-dev \
    pkg-config \
    tar

ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/
RUN tar -xzf ${llamacpp_version}.tar.gz \
    && cd llama.cpp-${llamacpp_version} \
    && cmake -B build \
        -DCMAKE_INSTALL_PREFIX=/usr \
        -DCMAKE_INSTALL_LIBDIR=/usr/lib \
        -DCMAKE_C_COMPILER=clang \
        -DCMAKE_CXX_COMPILER=clang++ \
        -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \
        -DGGML_CUDA=${llamacpp_cuda} \
        -DLLAMA_BUILD_COMMON=OFF \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=OFF \
    && cmake --build build --parallel --config Release \
    && cmake --install build

WORKDIR /app
COPY rust-toolchain.toml rust-toolchain.toml
RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none
ENV PATH="/root/.cargo/bin:$PATH"
RUN cargo install cargo-chef --locked

FROM deps AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

FROM deps AS builder
COPY --from=planner /app/recipe.json recipe.json
RUN cargo chef cook \
    --recipe-path recipe.json \
    --profile release-opt \
    --package text-generation-router-llamacpp
COPY . .
RUN cargo build \
    --profile release-opt \
    --package text-generation-router-llamacpp --frozen

FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04

RUN apt update && apt install -y \
    python3-venv \
    python3-pip

RUN python3 -m venv /venv
ENV PATH="/venv/bin:$PATH"

COPY backends/llamacpp/requirements.txt requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt

COPY --from=builder /usr/lib/libllama.so /usr/lib/
COPY --from=builder /usr/lib/libggml*.so /usr/lib/
COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/

ENV HF_HUB_ENABLE_HF_TRANSFER=1

ENTRYPOINT ["text-generation-router-llamacpp"]
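
For reference, a build invocation for this image could look like the sketch below. The build arguments (llamacpp_version, llamacpp_cuda, cuda_arch) are the ARGs defined above; the image tag and the choice to enable CUDA for a single architecture are assumptions, not part of the commit.

    # Sketch: build a CUDA-enabled image from the TGI repository root.
    # The tag name and the single 86-real architecture are arbitrary choices.
    docker build \
        -f Dockerfile_llamacpp \
        --build-arg llamacpp_version=b4651 \
        --build-arg llamacpp_cuda=ON \
        --build-arg cuda_arch=86-real \
        -t tgi-llamacpp .

Since the ENTRYPOINT is the router binary itself, runtime flags would simply be appended after the image name in docker run.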

backends/llamacpp/Cargo.toml

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
[package]
name = "text-generation-router-llamacpp"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true

[build-dependencies]
bindgen = "0.71.1"
pkg-config = "0.3.31"

[dependencies]
async-trait = "0.1.85"
clap = "4.5.27"
num_cpus = "1.16.0"
text-generation-router = { path = "../../router" }
thiserror = "2.0.11"
tokenizers.workspace = true
tokio = "1.43.0"
tokio-stream = "0.1.17"
tracing = "0.1.41"
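
Because bindgen and pkg-config are build-dependencies, llama.cpp must be discoverable through pkg-config when this crate compiles. A quick sanity check, assuming llama.cpp was installed under $LLAMACPP_PREFIX as in the README below:

    # Sketch: confirm the build script will be able to probe "llama".
    export PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig"
    pkg-config --exists llama && pkg-config --cflags --libs llama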

backends/llamacpp/README.md

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
# Llamacpp backend

If all your dependencies are installed at the system level, running
`cargo build` should be sufficient. However, if you want to experiment
with different versions of llama.cpp, some additional setup is required.

## Install llama.cpp

    LLAMACPP_PREFIX=$(pwd)/llama.cpp.out

    git clone https://github.com/ggerganov/llama.cpp
    cd llama.cpp
    cmake -B build \
        -DCMAKE_INSTALL_PREFIX="$LLAMACPP_PREFIX" \
        -DLLAMA_BUILD_COMMON=OFF \
        -DLLAMA_BUILD_TESTS=OFF \
        -DLLAMA_BUILD_EXAMPLES=OFF \
        -DLLAMA_BUILD_SERVER=OFF
    cmake --build build --config Release -j
    cmake --install build

## Build TGI

    PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig" cargo build
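
To mirror the Dockerfile's release build of just this backend, the same command can be given the release-opt profile and package name used there (a sketch; the plain cargo build shown above is enough for development):

    PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig" \
    cargo build \
        --profile release-opt \
        --package text-generation-router-llamacpp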

backends/llamacpp/build.rs

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
use bindgen::callbacks::{ItemInfo, ParseCallbacks};
use std::env;
use std::path::PathBuf;

#[derive(Debug)]
struct PrefixStripper;

impl ParseCallbacks for PrefixStripper {
    fn generated_name_override(&self, item_info: ItemInfo<'_>) -> Option<String> {
        item_info.name.strip_prefix("llama_").map(str::to_string)
    }
}

fn main() {
    if let Some(cuda_version) = option_env!("CUDA_VERSION") {
        let mut version: Vec<&str> = cuda_version.split('.').collect();
        if version.len() > 2 {
            version.pop();
        }
        let cuda_version = format!("cuda-{}", version.join("."));
        pkg_config::Config::new().probe(&cuda_version).unwrap();
    }
    let llama = pkg_config::Config::new().probe("llama").unwrap();

    for path in &llama.link_paths {
        println!("cargo:rustc-link-arg=-Wl,-rpath,{}", path.display());
    }
    println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags");

    let bindings = bindgen::Builder::default()
        .clang_args(
            llama
                .include_paths
                .iter()
                .map(|p| format!("-I{}", p.display())),
        )
        .header_contents("llama_bindings.h", "#include <llama.h>")
        .prepend_enum_name(false)
        .parse_callbacks(Box::new(PrefixStripper))
        .parse_callbacks(Box::new(bindgen::CargoCallbacks::new()))
        .generate()
        .expect("Unable to generate bindings");

    let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
    bindings
        .write_to_file(out_path.join("llamacpp.rs"))
        .expect("Couldn't write bindings!");
}
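
The build script only probes cuda-<major.minor> when CUDA_VERSION is set (dropping any trailing patch component), and it always embeds rpath entries for libllama's link paths. A CUDA-linked build outside the Dockerfile might therefore look like the sketch below; the directory holding the CUDA .pc files is an assumption about a typical toolkit install.

    # CUDA_VERSION=12.8.0 makes build.rs probe "cuda-12.8" via pkg-config,
    # so a cuda-12.8.pc must be on PKG_CONFIG_PATH alongside llama.pc.
    export CUDA_VERSION=12.8.0
    export PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig:/usr/lib/x86_64-linux-gnu/pkgconfig"
    cargo build --package text-generation-router-llamacpp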

backends/llamacpp/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
transformers==4.48.2
huggingface-hub==0.28.1
hf-transfer==0.1.9
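
The Dockerfile installs these Python packages into a virtual environment, presumably so the router can fetch models and tokenizers from the Hugging Face Hub (hf-transfer together with the HF_HUB_ENABLE_HF_TRANSFER=1 set above points at accelerated downloads). Reproducing that step locally is a one-liner; the venv location is an arbitrary choice.

    python3 -m venv .venv
    . .venv/bin/activate
    pip3 install --no-cache-dir -r backends/llamacpp/requirements.txt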
