From 95e221eeceb49aaf5118c60c3165c9df320b4efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 24 Jan 2025 09:05:37 +0000 Subject: [PATCH 01/63] Add llamacpp backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Cargo.lock | 924 +++++++++++++++------------ Cargo.toml | 1 + Dockerfile_llamacpp | 77 +++ backends/llamacpp/.cargo/config.toml | 2 + backends/llamacpp/Cargo.toml | 20 + backends/llamacpp/build.rs | 20 + backends/llamacpp/src/backend.rs | 434 +++++++++++++ backends/llamacpp/src/main.rs | 210 ++++++ backends/llamacpp/src/wrapper.h | 1 + 9 files changed, 1268 insertions(+), 421 deletions(-) create mode 100644 Dockerfile_llamacpp create mode 100644 backends/llamacpp/.cargo/config.toml create mode 100644 backends/llamacpp/Cargo.toml create mode 100644 backends/llamacpp/build.rs create mode 100644 backends/llamacpp/src/backend.rs create mode 100644 backends/llamacpp/src/main.rs create mode 100644 backends/llamacpp/src/wrapper.h diff --git a/Cargo.lock b/Cargo.lock index 915de0d582e..73ed43c6479 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,7 +24,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", - "getrandom", + "getrandom 0.2.15", "once_cell", "serde", "version_check", @@ -48,9 +48,9 @@ checksum = "4aa90d7ce82d4be67b64039a3d588d38dbcc6736577de4a847025ce5b0c468d1" [[package]] name = "allocator-api2" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45862d1c77f2228b9e10bc609d5bc203d86ebc9b87ad8d5d5167a6c9abf739d9" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" [[package]] name = "android-tzdata" @@ -108,19 +108,20 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "3.0.6" +version = "3.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" dependencies = [ "anstyle", + "once_cell", "windows-sys 0.59.0", ] [[package]] name = "anyhow" -version = "1.0.93" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" +checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" [[package]] name = "arbitrary" @@ -142,7 +143,7 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -181,18 +182,18 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "async-trait" -version = "0.1.83" +version = "0.1.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" +checksum = "3f934833b4b7233644e5848f235df3f57ed8c80f1528a26c3dfa13d2147fa056" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -266,28 +267,26 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.11.0" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fe7c2840b66236045acd2607d5866e274380afd87ef99d6226e961e2cb47df45" +checksum = "4c2b7ddaa2c56a367ad27a094ad8ef4faacf8a617c2575acb2ba88949df999ca" dependencies = [ "aws-lc-sys", - "mirai-annotations", "paste", "zeroize", ] [[package]] name = "aws-lc-sys" -version = "0.23.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad3a619a9de81e1d7de1f1186dcba4506ed661a0e483d84410fdef0ee87b2f96" +checksum = "71b2ddd3ada61a305e1d8bb6c005d1eaa7d14d903681edfc400406d523a9b491" dependencies = [ - "bindgen", + "bindgen 0.69.5", "cc", "cmake", "dunce", "fs_extra", - "libc", "paste", ] @@ -304,7 +303,7 @@ dependencies = [ "futures-util", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.31", + "hyper 0.14.32", "itoa", "matchit", "memchr", @@ -333,10 +332,10 @@ dependencies = [ "axum-core 0.4.5", "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.1", + "hyper 1.6.0", "hyper-util", "itoa", "matchit", @@ -351,7 +350,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper 1.0.2", "tokio", - "tower 0.5.1", + "tower 0.5.2", "tower-layer", "tower-service", "tracing", @@ -383,7 +382,7 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "http-body-util", "mime", @@ -404,7 +403,7 @@ dependencies = [ "axum 0.7.9", "futures-core", "futures-util", - "http 1.1.0", + "http 1.2.0", "opentelemetry 0.21.0", "pin-project-lite", "tower 0.4.13", @@ -452,7 +451,7 @@ version = "0.69.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cexpr", "clang-sys", "itertools 0.12.1", @@ -463,12 +462,32 @@ dependencies = [ "proc-macro2", "quote", "regex", - "rustc-hash", + "rustc-hash 1.1.0", "shlex", - "syn 2.0.89", + "syn 2.0.96", "which", ] +[[package]] +name = "bindgen" +version = "0.71.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f58bf3d7db68cfbac37cfc485a8d711e87e064c3d0fe0435b92f7a407f9d6b3" +dependencies = [ + "bitflags 2.8.0", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash 2.1.0", + "shlex", + "syn 2.0.96", +] + [[package]] name = "bit-set" version = "0.8.0" @@ -498,9 +517,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.6.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" [[package]] name = "bitstream-io" @@ -531,9 +550,9 @@ checksum = "c360505aed52b7ec96a3636c3f039d99103c37d1d9b4f7a8c743d3ea9ffcd03b" [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "bytecount" @@ -543,9 +562,9 @@ checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" -version = "1.20.0" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a" +checksum = 
"ef657dfab802224e671f5818e9a4935f9b1957ed18e58292690cc39e7a4092a3" [[package]] name = "byteorder" @@ -561,9 +580,9 @@ checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" [[package]] name = "bytes" -version = "1.8.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" [[package]] name = "camino" @@ -576,9 +595,9 @@ dependencies = [ [[package]] name = "cargo-platform" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24b1f0365a6c6bb4020cd05806fd0d33c44d38046b8bd7f0e40814b9763cabfc" +checksum = "e35af189006b9c0f00a064685c727031e3ed2d8020f7ba284d78cc2671bd36ea" dependencies = [ "serde", ] @@ -594,7 +613,7 @@ dependencies = [ "semver", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -620,9 +639,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.1" +version = "1.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" +checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" dependencies = [ "jobserver", "libc", @@ -704,9 +723,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.21" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" +checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796" dependencies = [ "clap_builder", "clap_derive", @@ -714,9 +733,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.21" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" +checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" dependencies = [ "anstream", "anstyle", @@ -726,27 +745,27 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.18" +version = "4.5.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "clap_lex" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "cmake" -version = "0.1.51" +version = "0.1.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +checksum = "e24a03c8b52922d68a1589ad61032f2c1aa5a8158d2aa0d93c6e9534944bbad6" dependencies = [ "cc", ] @@ -775,9 +794,9 @@ checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "compact_str" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6050c3a16ddab2e412160b31f2c871015704239bca62f72f6e5f0be631d3f644" +checksum = "3b79c4069c6cad78e2e0cdfcbd26275770669fb39fd308a752dc110e83b9af32" dependencies = [ "castaway", 
"cfg-if", @@ -789,15 +808,15 @@ dependencies = [ [[package]] name = "console" -version = "0.15.8" +version = "0.15.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b" dependencies = [ "encode_unicode", - "lazy_static", "libc", - "unicode-width 0.1.14", - "windows-sys 0.52.0", + "once_cell", + "unicode-width 0.2.0", + "windows-sys 0.59.0", ] [[package]] @@ -828,9 +847,9 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "cpufeatures" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b80225097f2e5ae4e7179dd2266824648f3e2f49d9134d584b76389d31c4c3" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] @@ -882,18 +901,18 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.13" +version = "0.5.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2" +checksum = "06ba6d68e24814cb8de6bb986db8222d3a027d15872cabc0d18817bc3c0e4471" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-deque" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", @@ -910,9 +929,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.20" +version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crossterm" @@ -920,7 +939,7 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "crossterm_winapi", "mio", "parking_lot", @@ -941,9 +960,9 @@ dependencies = [ [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -988,46 +1007,61 @@ dependencies = [ [[package]] name = "cxx" -version = "1.0.130" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23c042a0ba58aaff55299632834d1ea53ceff73d62373f62c9ae60890ad1b942" +checksum = "0fc894913dccfed0f84106062c284fa021c3ba70cb1d78797d6f5165d4492e45" dependencies = [ "cc", + "cxxbridge-cmd", "cxxbridge-flags", "cxxbridge-macro", + "foldhash", "link-cplusplus", ] [[package]] name = "cxx-build" -version = "1.0.130" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45dc1c88d0fdac57518a9b1f6c4f4fb2aca8f3c30c0d03d7d8518b47ca0bcea6" +checksum = "503b2bfb6b3e8ce7f95d865a67419451832083d3186958290cee6c53e39dfcfe" dependencies = [ "cc", "codespan-reporting", "proc-macro2", "quote", "scratch", - "syn 2.0.89", + "syn 2.0.96", +] + 
+[[package]] +name = "cxxbridge-cmd" +version = "1.0.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0d2cb64a95b4b5a381971482235c4db2e0208302a962acdbe314db03cbbe2fb" +dependencies = [ + "clap 4.5.27", + "codespan-reporting", + "proc-macro2", + "quote", + "syn 2.0.96", ] [[package]] name = "cxxbridge-flags" -version = "1.0.130" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa7ed7d30b289e2592cc55bc2ccd89803a63c913e008e6eb59f06cddf45bb52f" +checksum = "5f797b0206463c9c2a68ed605ab28892cca784f1ef066050f4942e3de26ad885" [[package]] name = "cxxbridge-macro" -version = "1.0.130" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b8c465d22de46b851c04630a5fc749a26005b263632ed2e0d9cc81518ead78d" +checksum = "e79010a2093848e65a3e0f7062d3f02fb2ef27f866416dfe436fccfa73d3bb59" dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1051,7 +1085,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1062,7 +1096,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1092,7 +1126,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1102,15 +1136,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.89", + "syn 2.0.96", ] -[[package]] -name = "diff" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" - [[package]] name = "digest" version = "0.10.7" @@ -1150,7 +1178,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1161,9 +1189,9 @@ checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" [[package]] name = "easy-cast" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10936778145f3bea71fd9bf61332cce28c28e96a380714f7ab34838b80733fd6" +checksum = "72852736692ec862655eca398c9bb1b476161b563c9f80f45f4808b9629750d6" dependencies = [ "libm", ] @@ -1185,9 +1213,9 @@ dependencies = [ [[package]] name = "encode_unicode" -version = "0.3.6" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" [[package]] name = "encoding_rs" @@ -1206,12 +1234,12 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.9" +version = "0.3.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" +checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1251,15 +1279,15 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "fdeflate" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07c6f4c64c1d33a3111c4466f7365ebdcc37c5bd1ea0d62aae2e3d722aacbedb" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" dependencies = [ "simd-adler32", ] @@ -1311,9 +1339,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "foldhash" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" +checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" [[package]] name = "foreign-types" @@ -1411,7 +1439,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -1471,7 +1499,19 @@ checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets 0.52.6", ] [[package]] @@ -1492,9 +1532,9 @@ checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "grpc-metadata" @@ -1518,7 +1558,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.6.0", + "indexmap 2.7.1", "slab", "tokio", "tokio-util", @@ -1536,8 +1576,8 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http 1.1.0", - "indexmap 2.6.0", + "http 1.2.0", + "indexmap 2.7.1", "slab", "tokio", "tokio-util", @@ -1577,9 +1617,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.1" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" dependencies = [ "allocator-api2", "equivalent", @@ -1629,18 +1669,18 @@ dependencies = [ "reqwest 0.11.27", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "ureq", ] [[package]] name = "home" -version = "0.5.9" +version = "0.5.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +checksum = "589533453244b0995c858700322199b2becb13b627df2851f64a2775d024abcf" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1667,9 +1707,9 @@ dependencies = [ [[package]] name = "http" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +checksum = 
"f16ca2af56261c99fba8bac40a10251ce8188205a4c448fbb745a2e4daa76fea" dependencies = [ "bytes", "fnv", @@ -1694,7 +1734,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.1.0", + "http 1.2.0", ] [[package]] @@ -1705,16 +1745,16 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "pin-project-lite", ] [[package]] name = "httparse" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "httpdate" @@ -1724,9 +1764,9 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "hyper" -version = "0.14.31" +version = "0.14.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c08302e8fa335b151b788c775ff56e7a03ae64ff85c548ee820fecb70356e85" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" dependencies = [ "bytes", "futures-channel", @@ -1748,15 +1788,15 @@ dependencies = [ [[package]] name = "hyper" -version = "1.5.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97818827ef4f364230e16705d4706e2897df2bb60617d6ca15d598025a3c481f" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", "futures-util", "h2 0.4.7", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "httparse", "httpdate", @@ -1769,16 +1809,16 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.3" +version = "0.27.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" +checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", - "http 1.1.0", - "hyper 1.5.1", + "http 1.2.0", + "hyper 1.6.0", "hyper-util", "log", - "rustls 0.23.17", + "rustls 0.23.21", "rustls-native-certs", "rustls-pki-types", "tokio", @@ -1792,7 +1832,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" dependencies = [ - "hyper 0.14.31", + "hyper 0.14.32", "pin-project-lite", "tokio", "tokio-io-timeout", @@ -1805,7 +1845,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper 0.14.31", + "hyper 0.14.32", "native-tls", "tokio", "tokio-native-tls", @@ -1820,9 +1860,9 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", - "hyper 1.5.1", + "hyper 1.6.0", "pin-project-lite", "socket2", "tokio", @@ -1968,7 +2008,7 @@ checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -2023,9 +2063,9 @@ dependencies = [ [[package]] name = "image-webp" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e031e8e3d94711a9ccb5d6ea357439ef3dcbed361798bd4071dc4d9793fbe22f" 
+checksum = "b77d01e822461baa8409e156015a1d91735549f0f2c17691bd2d996bef238f7f" dependencies = [ "byteorder-lite", "quick-error", @@ -2049,20 +2089,20 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", - "hashbrown 0.15.1", + "hashbrown 0.15.2", "serde", ] [[package]] name = "indicatif" -version = "0.17.9" +version = "0.17.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" dependencies = [ "console", "number_prefix", @@ -2085,23 +2125,22 @@ checksum = "94bd26b1b737bc11f183620072e188d1c6ede67e0e78682228d66b49ec510e17" dependencies = [ "opentelemetry 0.20.0", "opentelemetry-otlp", - "thiserror", + "thiserror 1.0.69", "tracing", "tracing-opentelemetry 0.21.0", ] [[package]] name = "instability" -version = "0.3.3" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b829f37dead9dc39df40c2d3376c179fdfd2ac771f53f55d3c30dc096a3c0c6e" +checksum = "0bf9fed6d91cfb734e7476a06bde8300a1b94e217e1b523b6f0cd1a01998c71d" dependencies = [ "darling", "indoc", - "pretty_assertions", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -2112,14 +2151,14 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "ipnet" -version = "2.10.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "is_terminal_polyfill" @@ -2165,9 +2204,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "540654e97a3f4470a492cd30ff187bc95d89557a903a2bbf112e2fae98104ef2" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "jobserver" @@ -2186,18 +2225,19 @@ checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0" [[package]] name = "js-sys" -version = "0.3.72" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" dependencies = [ + "once_cell", "wasm-bindgen", ] [[package]] name = "jsonschema" -version = "0.28.0" +version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d8eb539cdb4222da29bb658cc9881aa2477b33fb1a74c5c31450395fc1a4b2" +checksum = "4b8f66fe41fa46a5c83ed1c717b7e0b4635988f427083108c8cf0a882cc13441" dependencies = [ "ahash", "base64 0.22.1", @@ -2212,7 +2252,7 @@ dependencies = [ "percent-encoding", "referencing", "regex-syntax 0.8.5", - "reqwest 0.12.9", + "reqwest 0.12.12", "serde", "serde_json", "uuid-simd", @@ -2244,9 +2284,9 @@ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libfuzzer-sys" -version = "0.4.8" +version = "0.4.9" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa" +checksum = "cf78f52d400cf2d84a3a973a78a592b4adc535739e0a5597a0da6f0c357adc75" dependencies = [ "arbitrary", "cc", @@ -2254,9 +2294,9 @@ dependencies = [ [[package]] name = "libloading" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" +checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" dependencies = [ "cfg-if", "windows-targets 0.52.6", @@ -2274,7 +2314,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "libc", ] @@ -2289,15 +2329,15 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.14" +version = "0.4.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] name = "litemap" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" [[package]] name = "lock_api" @@ -2311,9 +2351,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.22" +version = "0.4.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" [[package]] name = "loop9" @@ -2330,7 +2370,7 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown 0.15.1", + "hashbrown 0.15.2", ] [[package]] @@ -2413,15 +2453,15 @@ checksum = "b4f0c8427b39666bf970460908b213ec09b3b350f20c0c2eabcbba51704a08e6" dependencies = [ "base64 0.22.1", "http-body-util", - "hyper 1.5.1", + "hyper 1.6.0", "hyper-rustls", "hyper-util", - "indexmap 2.6.0", + "indexmap 2.7.1", "ipnet", "metrics", "metrics-util", "quanta", - "thiserror", + "thiserror 1.0.69", "tokio", "tracing", ] @@ -2459,9 +2499,9 @@ dependencies = [ [[package]] name = "minijinja" -version = "2.5.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c37e1b517d1dcd0e51dc36c4567b9d5a29262b3ec8da6cb5d35e27a8fb529b5" +checksum = "cff7b8df5e85e30b87c2b0b3f58ba3a87b68e133738bf512a7713769326dbca9" dependencies = [ "serde", "serde_json", @@ -2469,9 +2509,9 @@ dependencies = [ [[package]] name = "minijinja-contrib" -version = "2.5.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe51f1a6a8285f03fcd1544d834234fe8db285f29e1c2253600c93b3ae19242" +checksum = "7ac3e47a9006ed0500425a092c9f8b2e56d10f8aeec8ce870c5e8a7c6ef2d7c3" dependencies = [ "minijinja", "serde", @@ -2485,9 +2525,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "b8402cab7aefae129c6977bb0ff1b8fd9a04eb5b51efc50a70bea51cda0c7924" dependencies = [ "adler2", "simd-adler32", @@ -2495,23 +2535,16 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.2" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ - "hermit-abi 0.3.9", "libc", "log", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.52.0", ] -[[package]] -name = "mirai-annotations" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" - [[package]] name = "monostate" version = "0.1.13" @@ -2530,7 +2563,7 @@ checksum = "a7ce64b975ed4f123575d11afd9491f2e37bbd5813fbfbc0f09ae1fbddea74e0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -2552,7 +2585,7 @@ dependencies = [ "futures", "pin-project", "rand", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-util", "tracing", @@ -2560,9 +2593,9 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.12" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" +checksum = "0dab59f8e050d5df8e4dd87d9206fb6f65a483e20ac9fda365ade4fab353196c" dependencies = [ "libc", "log", @@ -2596,7 +2629,7 @@ dependencies = [ "bytes", "futures", "hostname", - "hyper 0.14.31", + "hyper 0.14.32", "muxado", "once_cell", "parking_lot", @@ -2604,7 +2637,7 @@ dependencies = [ "rustls-pemfile", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-retry", "tokio-util", @@ -2618,7 +2651,7 @@ version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab2156c4fce2f8df6c499cc1c763e4394b7482525bf2a9701c9d79d215f519e4" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cfg-if", "cfg_aliases 0.1.1", "libc", @@ -2630,7 +2663,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cfg-if", "cfg_aliases 0.2.1", "libc", @@ -2730,7 +2763,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -2801,9 +2834,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] @@ -2844,11 +2877,11 @@ checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" [[package]] name = "openssl" -version = "0.10.68" +version = "0.10.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6174bc48f102d208783c2c84bf931bb75927a617866870de8a4ea85597f871f5" +checksum = "f5e534d133a060a3c19daec1eb3e98ec6f4685978834f2dbadfe2ec215bab64e" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cfg-if", "foreign-types", 
"libc", @@ -2865,14 +2898,14 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "openssl-probe" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "openssl-sys" @@ -2904,11 +2937,11 @@ checksum = "1e32339a5dc40459130b3bd269e9892439f55b33e772d2a9d402a789baaf4e8a" dependencies = [ "futures-core", "futures-sink", - "indexmap 2.6.0", + "indexmap 2.7.1", "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "urlencoding", ] @@ -2926,7 +2959,7 @@ dependencies = [ "opentelemetry_api", "opentelemetry_sdk 0.20.0", "prost 0.11.9", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic 0.9.2", ] @@ -2964,7 +2997,7 @@ dependencies = [ "js-sys", "once_cell", "pin-project-lite", - "thiserror", + "thiserror 1.0.69", "urlencoding", ] @@ -2986,7 +3019,7 @@ dependencies = [ "rand", "regex", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "tokio-stream", ] @@ -3005,10 +3038,10 @@ dependencies = [ "glob", "once_cell", "opentelemetry 0.21.0", - "ordered-float 4.5.0", + "ordered-float 4.6.0", "percent-encoding", "rand", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -3028,9 +3061,9 @@ dependencies = [ [[package]] name = "ordered-float" -version = "4.5.0" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c65ee1f9701bf938026630b455d5315f490640234259037edb259798b3bcf85e" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" dependencies = [ "num-traits", ] @@ -3048,9 +3081,9 @@ dependencies = [ [[package]] name = "outref" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "overload" @@ -3111,34 +3144,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset", - "indexmap 2.6.0", + "indexmap 2.7.1", ] [[package]] name = "pin-project" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be57f64e946e500c8ee36ef6331845d40a93055567ec57e8fae13efd33759b95" +checksum = "1e2ec53ad785f4d35dac0adea7f7dc6f1bb277ad84a680c7afefeae05d1f5916" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c0f5fad0874fc7abcd4d750e76917eaebbecaa2c20bde22e1dbeeba8beb758c" +checksum = "d56a66c0c55993aa927429d0f8a0abfd74f084e4d9c192cffed01e418d83eefb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "pin-project-lite" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -3182,9 +3215,9 @@ dependencies = [ [[package]] name = "png" -version = "0.17.14" 
+version = "0.17.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52f9d46a34a05a6a57566bc2bfae066ef07585a6e3fa30fbbdff5936380623f0" +checksum = "82151a2fc869e011c153adc57cf2789ccb8d9906ce52c0b39a6b5697749d7526" dependencies = [ "bitflags 1.3.2", "crc32fast", @@ -3195,9 +3228,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" +checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" [[package]] name = "powerfmt" @@ -3214,24 +3247,14 @@ dependencies = [ "zerocopy", ] -[[package]] -name = "pretty_assertions" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae130e2f271fbc2ac3a40fb1d07180839cdbbe443c7a27e1e3c13c5cac0116d" -dependencies = [ - "diff", - "yansi", -] - [[package]] name = "prettyplease" -version = "0.2.25" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" +checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" dependencies = [ "proc-macro2", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3260,9 +3283,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.92" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" dependencies = [ "unicode-ident", ] @@ -3283,7 +3306,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a65f2e60fbf1063868558d69c6beacf412dc755f9fc020f514b7955fc914fe30" dependencies = [ "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3323,7 +3346,7 @@ dependencies = [ "prost 0.12.6", "prost-types", "regex", - "syn 2.0.89", + "syn 2.0.96", "tempfile", ] @@ -3350,7 +3373,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3409,7 +3432,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3422,7 +3445,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -3436,15 +3459,15 @@ dependencies = [ [[package]] name = "quanta" -version = "0.12.3" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5167a477619228a0b284fac2674e3c388cba90631d7b7de620e6f1fcd08da5" +checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e" dependencies = [ "crossbeam-utils", "libc", "once_cell", "raw-cpuid", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", "web-sys", "winapi", ] @@ -3457,9 +3480,9 @@ checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" [[package]] name = "quote" -version = "1.0.37" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" dependencies = [ "proc-macro2", ] @@ -3491,7 +3514,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 
dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -3500,7 +3523,7 @@ version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdef7f9be5c0122f890d58bdf4d964349ba6a6161f705907526d891efabba57d" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "cassowary", "compact_str", "crossterm", @@ -3545,7 +3568,7 @@ dependencies = [ "rand_chacha", "simd_helpers", "system-deps", - "thiserror", + "thiserror 1.0.69", "v_frame", "wasm-bindgen", ] @@ -3567,11 +3590,11 @@ dependencies = [ [[package]] name = "raw-cpuid" -version = "11.2.0" +version = "11.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ab240315c661615f2ee9f0f2cd32d5a7343a84d5ebcccb99d46e6637565e7b0" +checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", ] [[package]] @@ -3607,11 +3630,11 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", ] [[package]] @@ -3620,9 +3643,9 @@ version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ - "getrandom", + "getrandom 0.2.15", "libredox", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -3642,14 +3665,14 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "referencing" -version = "0.28.0" +version = "0.28.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "093a875008827c0ae15c746189966e162faa05bf347719d06302c548ac63630f" +checksum = "d0dcb5ab28989ad7c91eb1b9531a37a1a137cc69a0499aee4117cae4a107c464" dependencies = [ "ahash", "fluent-uri", @@ -3716,7 +3739,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.31", + "hyper 0.14.32", "hyper-tls", "ipnet", "js-sys", @@ -3744,19 +3767,19 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.9" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" +checksum = "43e734407157c3c2034e0258f5e4473ddb361b1e85f95a66690d67264d7cd1da" dependencies = [ "base64 0.22.1", "bytes", "futures-channel", "futures-core", "futures-util", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.1", + "hyper 1.6.0", "hyper-util", "ipnet", "js-sys", @@ -3770,6 +3793,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper 1.0.2", "tokio", + "tower 0.5.2", "tower-service", "url", "wasm-bindgen", @@ -3807,7 +3831,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom", + "getrandom 0.2.15", "libc", "spin 0.9.8", "untrusted 0.9.0", @@ -3834,7 +3858,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.89", + "syn 2.0.96", "walkdir", ] @@ -3860,6 +3884,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" 
+version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497" + [[package]] name = "rustc_version" version = "0.4.1" @@ -3871,15 +3901,15 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.41" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3910,9 +3940,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.17" +version = "0.23.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f1a745511c54ba6d4465e8d5dfbd81b45791756de28d4981af70d6dca128f1e" +checksum = "8f287924602bf649d949c63dc8ac8b235fa5387d394020705b80c4eb597ce5b8" dependencies = [ "aws-lc-rs", "log", @@ -3932,7 +3962,7 @@ dependencies = [ "openssl-probe", "rustls-pki-types", "schannel", - "security-framework 3.0.1", + "security-framework 3.2.0", ] [[package]] @@ -3946,9 +3976,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" [[package]] name = "rustls-webpki" @@ -3964,15 +3994,15 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "same-file" @@ -4020,7 +4050,7 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "core-foundation 0.9.4", "core-foundation-sys", "libc", @@ -4029,11 +4059,11 @@ dependencies = [ [[package]] name = "security-framework" -version = "3.0.1" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1415a607e92bec364ea2cf9264646dcce0f91e6d65281bd6f2819cca3bf39c8" +checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "core-foundation 0.10.0", "core-foundation-sys", "libc", @@ -4042,9 +4072,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.12.1" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa39c7303dc58b5543c94d22c1766b0d31f2ee58306363ea622b10bbc075eaa2" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" dependencies = [ "core-foundation-sys", "libc", @@ -4052,18 +4082,18 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.23" +version = "1.0.25" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" dependencies = [ "serde", ] [[package]] name = "serde" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] @@ -4090,22 +4120,22 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "serde_json" -version = "1.0.133" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ - "indexmap 2.6.0", + "indexmap 2.7.1", "itoa", "memchr", "ryu", @@ -4246,9 +4276,9 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +checksum = "c970269d99b64e60ec3bd6ad27270092a5394c4e309314b18ae3fe575695fbe8" dependencies = [ "libc", "windows-sys 0.52.0", @@ -4315,7 +4345,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -4337,9 +4367,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.89" +version = "2.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e" +checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" dependencies = [ "proc-macro2", "quote", @@ -4369,7 +4399,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -4453,12 +4483,13 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.14.0" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" +checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" dependencies = [ "cfg-if", "fastrand", + "getrandom 0.3.1", "once_cell", "rustix", "windows-sys 0.59.0", @@ -4478,16 +4509,16 @@ name = "text-generation-backends-trtllm" version = "3.1.1-dev0" dependencies = [ "async-trait", - "clap 4.5.21", + "clap 4.5.27", "cmake", "cxx", "cxx-build", - "hashbrown 0.15.1", + "hashbrown 0.15.2", "hf-hub", "pkg-config", "pyo3", "text-generation-router", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tokio-stream", @@ -4499,7 +4530,7 @@ name = "text-generation-benchmark" version = "3.1.1-dev0" dependencies = [ "average", - "clap 4.5.21", + "clap 4.5.27", "float-ord", "hf-hub", "ratatui", @@ -4507,7 +4538,7 @@ dependencies = [ 
"serde_json", "tabled", "text-generation-client", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tracing", @@ -4524,7 +4555,7 @@ dependencies = [ "grpc-metadata", "prost 0.12.6", "prost-build", - "thiserror", + "thiserror 1.0.69", "tokio", "tonic 0.10.2", "tonic-build", @@ -4536,7 +4567,7 @@ dependencies = [ name = "text-generation-launcher" version = "3.1.1-dev0" dependencies = [ - "clap 4.5.21", + "clap 4.5.27", "ctrlc", "float_eq", "hf-hub", @@ -4547,7 +4578,7 @@ dependencies = [ "reqwest 0.11.27", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tracing", "tracing-subscriber", "vergen", @@ -4564,7 +4595,7 @@ dependencies = [ "axum-tracing-opentelemetry", "base64 0.22.1", "chrono", - "clap 4.5.21", + "clap 4.5.27", "csv", "futures", "futures-util", @@ -4590,7 +4621,7 @@ dependencies = [ "serde", "serde_json", "sysinfo", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tokio-stream", @@ -4605,6 +4636,22 @@ dependencies = [ "vergen", ] +[[package]] +name = "text-generation-router-llamacpp" +version = "3.0.2-dev0" +dependencies = [ + "async-trait", + "bindgen 0.71.1", + "clap 4.5.27", + "pkg-config", + "text-generation-router", + "thiserror 2.0.11", + "tokenizers", + "tokio", + "tokio-stream", + "tracing", +] + [[package]] name = "text-generation-router-v2" version = "3.1.1-dev0" @@ -4614,7 +4661,7 @@ dependencies = [ "axum 0.7.9", "axum-tracing-opentelemetry", "base64 0.22.1", - "clap 4.5.21", + "clap 4.5.27", "futures", "futures-util", "grpc-metadata", @@ -4639,7 +4686,7 @@ dependencies = [ "serde_json", "slotmap", "text-generation-router", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tokio-stream", @@ -4663,7 +4710,7 @@ dependencies = [ "axum 0.7.9", "axum-tracing-opentelemetry", "base64 0.22.1", - "clap 4.5.21", + "clap 4.5.27", "criterion", "futures", "futures-util", @@ -4690,7 +4737,7 @@ dependencies = [ "serde_json", "slotmap", "text-generation-router", - "thiserror", + "thiserror 1.0.69", "tokenizers", "tokio", "tokio-stream", @@ -4720,7 +4767,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + "thiserror-impl 2.0.11", ] [[package]] @@ -4731,7 +4787,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", ] [[package]] @@ -4757,9 +4824,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.36" +version = "0.3.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" +checksum = "35e7868883861bd0e56d9ac6efcaaca0d6d5d82a2a7ec8209ff492c07cf37b21" dependencies = [ "deranged", "itoa", @@ -4780,9 +4847,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.18" +version = "0.2.19" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" +checksum = "2834e6017e3e5e4b9834939793b282bc03b37a3336245fa820e35e233e2a85de" dependencies = [ "num-conv", "time-core", @@ -4810,14 +4877,14 @@ dependencies = [ [[package]] name = "tokenizers" -version = "0.20.3" +version = "0.20.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b67c92f6d705e2a1d106fb0b28c696f9074901a9c656ee5d9f5de204c39bf7" +checksum = "3b08cc37428a476fc9e20ac850132a513a2e1ce32b6a31addf2b74fa7033b905" dependencies = [ "aho-corasick", "derive_builder", "esaxx-rs", - "getrandom", + "getrandom 0.2.15", "hf-hub", "indicatif", "itertools 0.12.1", @@ -4835,7 +4902,7 @@ dependencies = [ "serde", "serde_json", "spm_precompiled", - "thiserror", + "thiserror 1.0.69", "unicode-normalization-alignments", "unicode-segmentation", "unicode_categories", @@ -4877,7 +4944,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -4903,12 +4970,11 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "5f6d0975eaace0cf0fcadee4e4aaa5da15b5c079146f2cffb67c113be122bf37" dependencies = [ - "rustls 0.23.17", - "rustls-pki-types", + "rustls 0.23.21", "tokio", ] @@ -4925,9 +4991,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.12" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" +checksum = "d7fcaa8d55a2bdd6b83ace262b016eca0d79ee02818c5c1bcdf0305114081078" dependencies = [ "bytes", "futures-core", @@ -4964,7 +5030,7 @@ version = "0.22.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" dependencies = [ - "indexmap 2.6.0", + "indexmap 2.7.1", "serde", "serde_spanned", "toml_datetime", @@ -4986,7 +5052,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.31", + "hyper 0.14.32", "hyper-timeout", "percent-encoding", "pin-project", @@ -5013,7 +5079,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.31", + "hyper 0.14.32", "hyper-timeout", "percent-encoding", "pin-project", @@ -5036,7 +5102,7 @@ dependencies = [ "proc-macro2", "prost-build", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -5061,14 +5127,14 @@ dependencies = [ [[package]] name = "tower" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2873938d487c3cfb9aed7546dc9f2711d867c9f90c46b889989a2cb84eba6b4f" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", "pin-project-lite", - "sync_wrapper 0.1.2", + "sync_wrapper 1.0.2", "tokio", "tower-layer", "tower-service", @@ -5081,9 +5147,9 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" dependencies = [ - "bitflags 2.6.0", + "bitflags 2.8.0", "bytes", - "http 1.1.0", + "http 1.2.0", "http-body 1.0.1", "http-body-util", "pin-project-lite", @@ -5105,9 +5171,9 @@ checksum = 
"8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "log", "pin-project-lite", @@ -5117,20 +5183,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable", @@ -5198,7 +5264,7 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9920abb6a3ee3a2af7d30c9ff02900f8481935d36723c3da95cf807468218e8c" dependencies = [ - "http 1.1.0", + "http 1.2.0", "opentelemetry 0.21.0", "tracing", "tracing-opentelemetry 0.22.0", @@ -5206,9 +5272,9 @@ dependencies = [ [[package]] name = "tracing-serde" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" dependencies = [ "serde", "tracing-core", @@ -5216,9 +5282,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "matchers", "nu-ansi-term", @@ -5249,15 +5315,15 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicase" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e51b68083f157f853b6379db119d1c1be0e6e4dec98101079dec41f6f5cf6df" +checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-normalization-alignments" @@ -5343,9 +5409,9 @@ dependencies = [ [[package]] name = "url" -version = "2.5.3" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d157f1b96d14500ffdc1f10ba712e780825526c03d9a49b4d0324b0d9113ada" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -5382,7 +5448,7 @@ version = "4.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c5afb1a60e207dca502682537fefcfd9921e71d0b83e9576060f09abc6efab23" dependencies = [ - "indexmap 
2.6.0", + "indexmap 2.7.1", "serde", "serde_json", "utoipa-gen", @@ -5398,7 +5464,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -5419,24 +5485,24 @@ dependencies = [ [[package]] name = "uuid" -version = "1.11.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" +checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" dependencies = [ - "getrandom", + "getrandom 0.2.15", "rand", "uuid-macro-internal", ] [[package]] name = "uuid-macro-internal" -version = "1.11.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b91f57fe13a38d0ce9e28a03463d8d3c2468ed03d75375110ec71d93b449a08" +checksum = "f8a86d88347b61a0e17b9908a67efcc594130830bf1045653784358dd023e294" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -5463,9 +5529,9 @@ dependencies = [ [[package]] name = "valuable" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" [[package]] name = "vcpkg" @@ -5532,49 +5598,59 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" dependencies = [ "cfg-if", "once_cell", + "rustversion", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.45" +version = "0.4.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc7ec4f8827a71586374db3e87abdb5a2bb3a15afed140221307c3ec06b1f63b" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" dependencies = [ "cfg-if", "js-sys", + "once_cell", "wasm-bindgen", "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5582,28 +5658,31 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.95" +version = "0.2.100" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.95" +version = "0.2.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] [[package]] name = "web-sys" -version = "0.3.72" +version = "0.3.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" dependencies = [ "js-sys", "wasm-bindgen", @@ -5962,9 +6041,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.20" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" +checksum = "ad699df48212c6cc6eb4435f35500ac6fd3b9913324f938aea302022ce19d310" dependencies = [ "memchr", ] @@ -5979,6 +6058,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.8.0", +] + [[package]] name = "write16" version = "1.0.0" @@ -5991,17 +6079,11 @@ version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" -[[package]] -name = "yansi" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" - [[package]] name = "yoke" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c5b1314b079b0930c31e3af543d8ee1757b1951ae1e1565ec704403a7240ca5" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" dependencies = [ "serde", "stable_deref_trait", @@ -6011,13 +6093,13 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28cc31741b18cb6f1d5ff12f5b7523e3d6eb0852bbbad19d73905511d9849b95" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", "synstructure", ] @@ -6039,27 +6121,27 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] name = "zerofrom" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91ec111ce797d0e0784a1116d0ddcdbea84322cd79e5d5ad173daeba4f93ab55" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.4" +version = "0.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea7b4a3637ea8669cedf0f1fd5c286a17f3de97b8dd5a70a6c167a1730e63a5" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", "synstructure", ] @@ -6088,7 +6170,7 @@ checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.89", + "syn 2.0.96", ] [[package]] @@ -6120,9 +6202,9 @@ dependencies = [ [[package]] name = "zune-jpeg" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16099418600b4d8f028622f73ff6e3deaabdff330fb9a2a131dea781ee8b0768" +checksum = "99a5bab8d7dedf81405c4bb1f2b83ea057643d9cb28778cea9eecddeedd2e028" dependencies = [ "zune-core", ] diff --git a/Cargo.toml b/Cargo.toml index 6fd4b51d7e4..df7f2a73e56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "backends/v3", "backends/grpc-metadata", "backends/trtllm", + "backends/llamacpp", "launcher", "router" ] diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp new file mode 100644 index 00000000000..7f083ec20cb --- /dev/null +++ b/Dockerfile_llamacpp @@ -0,0 +1,77 @@ +FROM ubuntu:24.04 AS base + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y \ + python3-venv \ + python3-pip + +RUN python3 -m venv /venv +ENV PATH="/venv/bin:$PATH" +RUN pip3 install --no-cache-dir transformers + +FROM base AS deps +WORKDIR /deps + +RUN apt-get install -y \ + clang cmake git + +# nvidia-cuda-toolkit +# -DGGML_CUDA=ON \ + +ENV LLAMA_VERSION=b4585 +RUN git clone --depth 1 -b ${LLAMA_VERSION} https://github.com/ggerganov/llama.cpp \ + && cd llama.cpp \ + && cmake -B build \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DCMAKE_INSTALL_LIBDIR=/usr/lib \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=OFF \ + && cmake --build build --config Release -j \ + && cmake --install build + +# ENV MIMALLOC_VERSION=v3.0.1 +# RUN git clone --depth 1 -b ${MIMALLOC_VERSION} https://github.com/microsoft/mimalloc \ +# && cd mimalloc \ +# && cmake -B build \ +# -DCMAKE_INSTALL_PREFIX=/usr \ +# -DCMAKE_INSTALL_LIBDIR=/usr/lib \ +# -DCMAKE_C_COMPILER=clang \ +# -DCMAKE_CXX_COMPILER=clang++ \ +# && cmake --build build --config Release -j \ +# && cmake --install build + +RUN apt-get install -y \ + curl pkg-config libssl-dev + +WORKDIR /app +COPY rust-toolchain.toml rust-toolchain.toml +RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none +ENV PATH="/root/.cargo/bin:$PATH" +RUN cargo install cargo-chef --locked + +FROM deps AS planner +COPY . . +RUN cargo chef prepare --recipe-path recipe.json + +FROM deps AS builder +COPY --from=planner /app/recipe.json recipe.json +RUN cargo chef cook \ + --recipe-path recipe.json \ + --profile release-opt \ + --package text-generation-router-llamacpp +COPY . . 
+RUN cargo build \ + --profile release-opt \ + --package text-generation-router-llamacpp --frozen + +FROM base AS runtime + +COPY --from=deps /usr/lib/libllama.so /usr/lib/ +COPY --from=deps /usr/lib/libggml*.so /usr/lib/ +COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /bin/text-generation-launcher + +ENTRYPOINT ["text-generation-launcher"] diff --git a/backends/llamacpp/.cargo/config.toml b/backends/llamacpp/.cargo/config.toml new file mode 100644 index 00000000000..ddff4407b90 --- /dev/null +++ b/backends/llamacpp/.cargo/config.toml @@ -0,0 +1,2 @@ +[build] +rustflags = ["-C", "target-cpu=native"] diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml new file mode 100644 index 00000000000..b1ff3c3fc7f --- /dev/null +++ b/backends/llamacpp/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "text-generation-router-llamacpp" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true + +[build-dependencies] +bindgen = "0.71.1" +pkg-config = "0.3.31" + +[dependencies] +async-trait = "0.1.85" +clap = "4.5.27" +text-generation-router = { path = "../../router" } +thiserror = "2.0.11" +tokenizers.workspace = true +tokio = "1.43.0" +tokio-stream = "0.1.17" +tracing = "0.1.41" diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs new file mode 100644 index 00000000000..844da32109d --- /dev/null +++ b/backends/llamacpp/build.rs @@ -0,0 +1,20 @@ +use std::env; +use std::path::PathBuf; + +fn main() { + let bindings = bindgen::Builder::default() + .header("src/wrapper.h") + .prepend_enum_name(false) + .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) + .generate() + .expect("Unable to generate bindings"); + + let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); + bindings + .write_to_file(out_path.join("bindings.rs")) + .expect("Couldn't write bindings!"); + + pkg_config::Config::new().probe("llama").unwrap(); + + println!("cargo::rerun-if-changed=build.rs"); +} diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs new file mode 100644 index 00000000000..bb61b4ade98 --- /dev/null +++ b/backends/llamacpp/src/backend.rs @@ -0,0 +1,434 @@ +mod bindings { + #![allow(non_upper_case_globals)] + #![allow(non_camel_case_types)] + #![allow(non_snake_case)] + #![allow(dead_code)] + include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +} +use async_trait::async_trait; +use std::ffi::CString; +use std::sync::Once; +use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; +use text_generation_router::validation::{ValidGenerateRequest}; +use text_generation_router::{FinishReason, Token}; +use thiserror::Error; +use tokenizers::Tokenizer; +use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; +use tokio::sync::{watch, oneshot}; +use tokio::task::spawn_blocking; +use tokio::time::Instant; +use tokio_stream::wrappers::UnboundedReceiverStream; +use tracing::{debug, info, warn, error, trace}; +use tracing::{instrument}; + +pub struct LlamacppConfig { + pub model_gguf: String, + pub n_ctx: u32, + pub n_threads: i32, + pub use_mmap: bool, + pub use_mlock: bool, + pub flash_attention: bool, +} + +#[derive(Debug)] +struct LlamacppRequest { + input_ids: Vec, + top_k: i32, + top_p: f32, + typical_p: f32, + min_keep: usize, + temp: f32, + seed: u32, + penalty_last_n: i32, + penalty_repeat: f32, + penalty_freq: f32, + penalty_present: f32, + max_new_tokens: usize, + tx: UnboundedSender>, + time: Instant, +} + +pub struct LlamacppBackend { + 
tx: UnboundedSender, + status: watch::Receiver, +} + +impl LlamacppRequest { + fn new( + from: &ValidGenerateRequest, + tx: UnboundedSender>, + ) -> Option{ + if let Some(input_ids) = from.input_ids.as_ref() { + Some(LlamacppRequest { + input_ids: input_ids.iter().map(|&x| x as i32).collect(), + top_k: from.parameters.top_k as _, + top_p: from.parameters.top_p as _, + typical_p: from.parameters.typical_p as _, + min_keep: 0, // disabled + temp: from.parameters.temperature as _, + seed: from.parameters.seed as _, + penalty_last_n: -1, // 0 = disabled, -1 = context size + penalty_repeat: from.parameters.repetition_penalty as _, + penalty_freq: from.parameters.frequency_penalty as _, + penalty_present: 0.0, // disabled + max_new_tokens: from.stopping_parameters.max_new_tokens as _, + tx: tx, + time: Instant::now(), + }) + } else { + None + } + } +} + +struct Llamacpp { + model: *mut bindings::llama_model, + ctx: *mut bindings::llama_context, + vocab: *const bindings::llama_vocab, + n_ctx: u32, +} + +extern "C" fn llamacpp_log_callback( + level: bindings::ggml_log_level, + msg: *const std::os::raw::c_char, + _user_data: *mut std::os::raw::c_void, +) { + let cmsg = unsafe { std::ffi::CStr::from_ptr(msg) }; + let rmsg = cmsg.to_string_lossy().trim_end_matches('\n').to_string(); + + match level { + bindings::GGML_LOG_LEVEL_DEBUG => debug!(target: "llamacpp", "{}", rmsg), + bindings::GGML_LOG_LEVEL_INFO => info!(target: "llamacpp", "{}", rmsg), + bindings::GGML_LOG_LEVEL_WARN => warn!(target: "llamacpp", "{}", rmsg), + bindings::GGML_LOG_LEVEL_ERROR => error!(target: "llamacpp", "{}", rmsg), + _ => trace!(target: "llamacpp", "{}", rmsg), + } +} + +impl Llamacpp { + fn new(conf: LlamacppConfig) -> Result { + let gguf = CString::new(conf.model_gguf)?; + + let model = unsafe { + let mut params = bindings::llama_model_default_params(); + params.use_mmap = conf.use_mmap; + params.use_mlock = conf.use_mlock; + bindings::llama_model_load_from_file(gguf.as_ptr(), params) + }; + if model.is_null() { + return Err(BackendError::Llamacpp("Failed to load model".to_string())) + } + let ctx = unsafe { + let mut params = bindings::llama_context_default_params(); + params.n_ctx = conf.n_ctx; + params.n_threads = conf.n_threads; + params.n_threads_batch = conf.n_threads; + params.flash_attn = conf.flash_attention; + params.no_perf = true; + bindings::llama_init_from_model(model, params) + }; + if ctx.is_null() { + return Err(BackendError::Llamacpp("Failed to init context".to_string())) + } + let n_ctx = unsafe { bindings::llama_n_ctx(ctx) }; + + let vocab = unsafe { + bindings::llama_model_get_vocab(model) + }; + if vocab.is_null() { + return Err(BackendError::Llamacpp("Failed to get vocab".to_string())); + } + Ok(Llamacpp{model, ctx, vocab, n_ctx}) + } + // useless ? 
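+    // Decodes a throwaway bos/eos batch once at startup so ggml allocates its
+    // compute buffers before the first real request; the KV cache is cleared afterwards.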
+ fn warmup(&self) { + let mut buf: Vec = Vec::new(); + + let bos = unsafe { + bindings::llama_vocab_bos(self.vocab) + }; + if bos != bindings::LLAMA_TOKEN_NULL { + buf.push(bos); + } + let eos = unsafe { + bindings::llama_vocab_eos(self.vocab) + }; + if eos != bindings::LLAMA_TOKEN_NULL { + buf.push(eos); + } + if buf.is_empty() { + warn!("Warmup failed: no bos/eos..."); + return; + } + let batch = unsafe { + bindings::llama_batch_get_one(buf.as_ptr() as _, buf.len() as _) + }; + if unsafe { bindings::llama_decode(self.ctx, batch) } != 0 { + error!("Warmup failed: llama_decode() returned an error"); + } + unsafe { + bindings::llama_kv_cache_clear(self.ctx); + bindings::llama_synchronize(self.ctx); + } + } +} + +impl Drop for Llamacpp { + fn drop(&mut self) { + if !self.ctx.is_null() { + unsafe { bindings::llama_free(self.ctx) }; + } + if !self.model.is_null() { + unsafe { bindings::llama_model_free(self.model) }; + } + } +} + +struct LlamacppSampler { + chain: *mut bindings::llama_sampler, +} + +impl LlamacppSampler { + fn new(req: &LlamacppRequest) -> Option { + let chain = unsafe { + let params = bindings::llama_sampler_chain_default_params(); + bindings::llama_sampler_chain_init(params) + }; + if chain.is_null() { + error!("Failed to init sampler"); + return None; + } + let top_k = unsafe { + bindings::llama_sampler_init_top_k(req.top_k) + }; + let top_p = unsafe { + bindings::llama_sampler_init_top_p(req.top_p, req.min_keep) + }; + let typical_p = unsafe { + bindings::llama_sampler_init_typical(req.typical_p, req.min_keep) + }; + let temp = unsafe { + bindings::llama_sampler_init_temp(req.temp) + }; + let penalties = unsafe { + bindings::llama_sampler_init_penalties( + req.penalty_last_n, + req.penalty_repeat, + req.penalty_freq, + req.penalty_present, + ) + }; + let dist = unsafe { + bindings::llama_sampler_init_dist(req.seed) + }; + let mut failed = false; + + for (k, v) in &[("top_k", top_k), + ("top_p", top_p), + ("typical_p", typical_p), + ("temp", temp), + ("penalties", penalties), + ("dist", dist)] { + if v.is_null() { + error!("Failed to init {k} sampler"); + failed = true; + } else { + unsafe { bindings::llama_sampler_chain_add(chain, *v) }; + } + } + if failed { + None + } else { + Some(LlamacppSampler{chain}) + } + } + + fn sample(&self, llamacpp: &Llamacpp) -> bindings::llama_token { + // use apply/accept ? + unsafe { bindings::llama_sampler_sample(self.chain, llamacpp.ctx, -1) }// -1 ? 
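+        // (idx == -1 samples from the logits of the last token in the most recent batch.)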
+ } +} + +impl Drop for LlamacppSampler { + fn drop(&mut self) { + if !self.chain.is_null() { + unsafe { bindings::llama_sampler_free(self.chain) }; + } + } +} + +static INIT: Once = Once::new(); + +impl LlamacppBackend { + pub fn new( + conf: LlamacppConfig, + tokenizer: Tokenizer, + ) -> (Self, oneshot::Receiver>) { + + // Setup llama & export logs, once and for all + INIT.call_once(|| unsafe { + bindings::llama_log_set(Some(llamacpp_log_callback), std::ptr::null_mut()); + bindings::llama_backend_init(); + bindings::llama_numa_init(bindings::GGML_NUMA_STRATEGY_NUMACTL); // TODO add option & test + }); + + let (status_tx, status_rx) = watch::channel(false); + let (ok_tx, ok_rx) = oneshot::channel(); + let (tx, mut rx) = unbounded_channel::(); + + spawn_blocking(move || { + let llamacpp = match Llamacpp::new(conf) { + Ok(v) => { let _ = ok_tx.send(Ok(())); v }, + Err(e) => { let _ = ok_tx.send(Err(e)); return; }, + }; + llamacpp.warmup(); + + let vocab = tokenizer.get_added_vocabulary(); + + // health() returns true + let _ = status_tx.send(true); + + while let Some(request) = rx.blocking_recv() { + debug!("Request: {:?}", request); + + let start_time = Instant::now(); + + // TODO: do a real batch + let mut batch = unsafe { + bindings::llama_batch_get_one( + request.input_ids.as_ptr() as _, + request.input_ids.len() as _, + ) + }; + // TODO: move up for perf ? + let sampler = match LlamacppSampler::new(&request) { + Some(sampler) => sampler, + _ => { + let _ = request.tx.send(Err(InferError::IncompleteGeneration)); + continue; + }, + }; + let mut text = String::with_capacity(1024); + let mut n_tokens: usize = 0; + + loop { + debug!(?batch); + match unsafe { bindings::llama_decode(llamacpp.ctx, batch) } { + 0 => { }, + 1 => { + unsafe { + // TODO: seq_rm & seq_add if model is compatible + bindings::llama_kv_cache_clear(llamacpp.ctx); + } + let _ = request.tx.send(Err(InferError::IncompleteGeneration)); + continue; + }, + _ => { + debug!("decode return <0"); + let _ = request.tx.send(Err(InferError::IncompleteGeneration)); + break; + }, + }; + let mut next = sampler.sample(&llamacpp); + n_tokens += 1; + debug!(?n_tokens); + + let logits = unsafe { + *bindings::llama_get_logits_ith(llamacpp.ctx, -1) + }; + let kv_cache_used_cells = unsafe { + bindings::llama_get_kv_cache_used_cells(llamacpp.ctx) + }; + let piece = match tokenizer.decode(&[next as u32], false) { + Ok(piece) => piece, + Err(e) => { + error!("Failed to decode token: {e}"); + let _ = request.tx.send(Err(InferError::IncompleteGeneration)); + break; + }, + }; + let special = vocab.is_special_token(&piece); + + if !special { + text.push_str(&piece); + } + let token = Token { + id: next as _, + text: piece, + logprob: logits as _, + special: special, + }; + let finish: Option = { + if unsafe { bindings::llama_vocab_is_eog(llamacpp.vocab, next) } { + Some(FinishReason::EndOfSequenceToken) + } else if n_tokens == request.max_new_tokens { + Some(FinishReason::Length) + } else if kv_cache_used_cells == llamacpp.n_ctx as i32 { + Some(FinishReason::Length) // TODO: check + } else { + None + } + }; + if let Some(reason) = finish { + let _ = request.tx.send(Ok(InferStreamResponse::End { + token: token, + top_tokens: vec![], + generated_text: GeneratedText { + text: text, + generated_tokens: n_tokens as _, + finish_reason: reason, + seed: Some(request.seed as _), + }, + start: start_time, + queued: request.time, + })); + break; + } + let _ = request.tx.send(Ok(InferStreamResponse::Intermediate { + token: token, + top_tokens: vec![], + })); + 
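+                    // Feed the sampled token back as a single-token batch for the next decode step.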
batch = unsafe { + bindings::llama_batch_get_one(&mut next, 1) + }; + } + } + }); + (Self{tx, status: status_rx}, ok_rx) + } +} + +#[async_trait] +impl Backend for LlamacppBackend { + #[instrument(skip_all)] + fn schedule( + &self, + request: ValidGenerateRequest, + ) -> Result>, InferError> { + debug!(?request); + let (tx, rx) = unbounded_channel::>(); + match LlamacppRequest::new(&request, tx) { + Some(v) => match self.tx.send(v) { + Err(e) => Err(InferError::GenerationError(e.to_string())), + _ => Ok(UnboundedReceiverStream::new(rx)), + }, + _ => Err(InferError::GenerationError("Bad request".to_string())), + } + } + + async fn health(&self, _: bool) -> bool { + *self.status.borrow() + } + + fn name(&self) -> &'static str { + "llamacpp" + } +} + +#[derive(Debug, Error)] +pub enum BackendError { + #[error("CString error: {0}")] + CStringError(#[from] std::ffi::NulError), + #[error("Llamacpp error: {0}")] + Llamacpp(String), +} diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs new file mode 100644 index 00000000000..00d84cebaa4 --- /dev/null +++ b/backends/llamacpp/src/main.rs @@ -0,0 +1,210 @@ +mod backend; + +use backend::{LlamacppConfig, LlamacppBackend, BackendError}; +use clap::{Parser}; +use text_generation_router::{logging, server, usage_stats}; +use thiserror::Error; +use tokenizers::{Tokenizer, FromPretrainedParameters}; +use tokio::sync::oneshot::error::RecvError; +use tracing::error; + +/// Backend Configuration +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about = None)] +struct Args { + /// Name of the model to load. + #[clap(long, env)] + model_id: String, + + /// Revision of the model. + #[clap(default_value = "main", long, env)] + revision: String, + + /// Path to the GGUF model file to be used for inference. + #[clap(long, env)] + model_gguf: String, // TODO Option() with hf->gguf & quantize + + /// Context size for the model. + #[clap(default_value = "4096", long, env)] + n_ctx: u32, + + /// Number of threads to use for inference. + #[clap(default_value = "1", long, env)] + n_threads: i32, + + #[clap(default_value = "true", long, env)] + /// Whether to use memory mapping. + use_mmap: bool, + + #[clap(default_value = "false", long, env)] + /// Whether to use memory locking. + use_mlock: bool, + + /// Enable flash attention for faster inference. (EXPERIMENTAL) + #[clap(default_value = "false", long, env)] + flash_attention: bool, + + /// TODO + #[clap(default_value = "2", long, env)] + validation_workers: usize, + #[clap(default_value = "128", long, env)] + max_concurrent_requests: usize, + #[clap(default_value = "2", long, env)] + max_best_of: usize, + #[clap(default_value = "4", long, env)] + max_stop_sequences: usize, + #[clap(default_value = "5", long, env)] + max_top_n_tokens: u32, + + /// Maximum number of input tokens allowed per request. + #[clap(default_value = "1024", long, env)] + max_input_tokens: usize, + + /// Maximum total tokens (input + output) allowed per request. + #[clap(default_value = "2048", long, env)] + max_total_tokens: usize, + +// #[clap(default_value = "1.2", long, env)] +// waiting_served_ratio: f32, +// #[clap(default_value = "4096", long, env)] +// max_batch_prefill_tokens: u32, +// #[clap(long, env)] +// max_batch_total_tokens: Option, +// #[clap(default_value = "20", long, env)] +// max_waiting_tokens: usize, +// #[clap(long, env)] +// max_batch_size: Option, + + /// The IP address to listen on + #[clap(default_value = "0.0.0.0", long, env)] + hostname: String, + + /// The port to listen on. 
+ #[clap(default_value = "3001", long, short, env)] + port: u16, + +// #[clap(default_value = "/tmp/text-generation-server-0", long, env)] +// master_shard_uds_path: String, +// #[clap(long, env)] +// tokenizer_name: String, +// #[clap(long, env)] +// tokenizer_config_path: Option, +// #[clap(long, env, value_enum)] +// trust_remote_code: bool, +// #[clap(long, env)] +// api_key: Option, + + #[clap(long, env)] + json_output: bool, + #[clap(long, env)] + otlp_endpoint: Option, + #[clap(default_value = "text-generation-inference.router", long, env)] + otlp_service_name: String, + #[clap(long, env)] + cors_allow_origin: Option>, + #[clap(long, env)] + ngrok: bool, + #[clap(long, env)] + ngrok_authtoken: Option, + #[clap(long, env)] + ngrok_edge: Option, + #[clap(long, env)] + tokenizer_config_path: Option, + #[clap(long, env, default_value_t = false)] + disable_grammar_support: bool, + #[clap(default_value = "4", long, env)] + max_client_batch_size: usize, + #[clap(default_value = "on", long, env)] + usage_stats: usage_stats::UsageStatsLevel, + #[clap(default_value = "2000000", long, env)] + payload_limit: usize, +} + +#[tokio::main] +async fn main() -> Result<(), RouterError> { + let args = Args::parse(); + + logging::init_logging( + args.otlp_endpoint, + args.otlp_service_name, + args.json_output + ); + + if args.max_input_tokens >= args.max_total_tokens { + return Err(RouterError::ArgumentValidation( + "`max_input_tokens` must be < `max_total_tokens`".to_string(), + )); + } + + // TODO: check if we use the same cache of Server + // check if llamacpp is faster + let tokenizer = { + let token = std::env::var("HF_TOKEN") + .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) + .ok(); + let params = FromPretrainedParameters { + revision: args.revision.clone(), + token: token, + ..Default::default() + }; + Tokenizer::from_pretrained( + args.model_id.clone(), + Some(params) + )? 
+ }; + + let (backend, ok) = LlamacppBackend::new( + LlamacppConfig { + model_gguf: args.model_gguf, + n_ctx: args.n_ctx, + n_threads: args.n_threads, + use_mmap: args.use_mmap, + use_mlock: args.use_mlock, + flash_attention: args.flash_attention, + }, + tokenizer, + ); + ok.await??; + + server::run( + backend, + args.max_concurrent_requests, + args.max_best_of, + args.max_stop_sequences, + args.max_top_n_tokens, + args.max_input_tokens, + args.max_total_tokens, + args.validation_workers, + None, // api_key + args.model_id, // tokenizer_name + args.tokenizer_config_path, + Some(args.revision), + false, // trust_remote_code + args.hostname, + args.port, + args.cors_allow_origin, + args.ngrok, + args.ngrok_authtoken, + args.ngrok_edge, + args.disable_grammar_support, + args.max_client_batch_size, + args.usage_stats, + args.payload_limit, + ) + .await?; + Ok(()) +} + +#[derive(Debug, Error)] +enum RouterError { + #[error("Argument validation error: {0}")] + ArgumentValidation(String), + #[error("Tokenizer error: {0}")] + Tokenizer(#[from] tokenizers::Error), + #[error("Backend error: {0}")] + Backend(#[from] BackendError), + #[error("WebServer error: {0}")] + WebServer(#[from] server::WebServerError), + #[error("Recv error: {0}")] + RecvError(#[from] RecvError), +} diff --git a/backends/llamacpp/src/wrapper.h b/backends/llamacpp/src/wrapper.h new file mode 100644 index 00000000000..630ebeec15a --- /dev/null +++ b/backends/llamacpp/src/wrapper.h @@ -0,0 +1 @@ +#include From bd0cc9905c672b643aecc450a40b9f3be26e18b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 30 Jan 2025 13:41:35 +0000 Subject: [PATCH 02/63] Get rid of llama_batch_get_one() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 128 +++++++++++++++++++++++-------- backends/llamacpp/src/main.rs | 2 + 2 files changed, 100 insertions(+), 30 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index bb61b4ade98..80c04bc326f 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -7,7 +7,7 @@ mod bindings { } use async_trait::async_trait; use std::ffi::CString; -use std::sync::Once; +use std::sync::{mpsc, Once}; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::{ValidGenerateRequest}; use text_generation_router::{FinishReason, Token}; @@ -15,8 +15,8 @@ use thiserror::Error; use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::sync::{watch, oneshot}; -use tokio::task::spawn_blocking; -use tokio::time::Instant; +use tokio::task::{spawn, spawn_blocking}; +use tokio::time::{Duration, Instant, timeout}; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, info, warn, error, trace}; use tracing::{instrument}; @@ -24,6 +24,8 @@ use tracing::{instrument}; pub struct LlamacppConfig { pub model_gguf: String, pub n_ctx: u32, + pub batch_size: usize, + pub batch_timeout: Duration, pub n_threads: i32, pub use_mmap: bool, pub use_mlock: bool, @@ -85,6 +87,7 @@ struct Llamacpp { model: *mut bindings::llama_model, ctx: *mut bindings::llama_context, vocab: *const bindings::llama_vocab, + batch: bindings::llama_batch, n_ctx: u32, } @@ -138,8 +141,39 @@ impl Llamacpp { if vocab.is_null() { return Err(BackendError::Llamacpp("Failed to get vocab".to_string())); } - Ok(Llamacpp{model, ctx, vocab, 
n_ctx}) + let batch = unsafe { + bindings::llama_batch_init(4096, 0, 5) + }; + // TODO check batch + Ok(Llamacpp{model, ctx, vocab, n_ctx, batch}) } + + fn batch_push( + &mut self, + token: bindings::llama_token, + pos: bindings::llama_pos, + seq_ids: &[bindings::llama_seq_id], + logits: bool, + ) { + // TODO check evertyhing.. + let n = self.batch.n_tokens as usize; + + unsafe { + *self.batch.token.add(n) = token; + *self.batch.pos.add(n) = pos; + *self.batch.n_seq_id.add(n) = seq_ids.len() as i32; + } + for (i, &seq_id) in seq_ids.iter().enumerate() { + unsafe { + *(*self.batch.seq_id.add(n)).add(i) = seq_id; + } + } + unsafe { + *self.batch.logits.add(n) = logits as i8; + } + self.batch.n_tokens += 1; + } + // useless ? fn warmup(&self) { let mut buf: Vec = Vec::new(); @@ -181,6 +215,7 @@ impl Drop for Llamacpp { if !self.model.is_null() { unsafe { bindings::llama_model_free(self.model) }; } + unsafe { bindings::llama_batch_free(self.batch) }; } } @@ -223,12 +258,12 @@ impl LlamacppSampler { }; let mut failed = false; - for (k, v) in &[("top_k", top_k), - ("top_p", top_p), + for (k, v) in &[( "top_k", top_k ), + ( "top_p", top_p ), ("typical_p", typical_p), - ("temp", temp), + ( "temp", temp ), ("penalties", penalties), - ("dist", dist)] { + ( "dist", dist )] { if v.is_null() { error!("Failed to init {k} sampler"); failed = true; @@ -275,9 +310,33 @@ impl LlamacppBackend { let (status_tx, status_rx) = watch::channel(false); let (ok_tx, ok_rx) = oneshot::channel(); let (tx, mut rx) = unbounded_channel::(); + let (sync_tx, sync_rx) = mpsc::channel(); + + spawn(async move { + let mut requests = Vec::new(); + + loop { + match timeout(conf.batch_timeout, rx.recv()).await { + Ok(None) => break, // closed + Ok(Some(request)) => { + requests.push(request); + if requests.len() >= conf.batch_size { + let _ = sync_tx.send(requests); + requests = Vec::new(); + } + }, + Err(_) => { + if !requests.is_empty() { + let _ = sync_tx.send(requests); + requests = Vec::new(); + } + } + } + } + }); spawn_blocking(move || { - let llamacpp = match Llamacpp::new(conf) { + let mut llamacpp = match Llamacpp::new(conf) { Ok(v) => { let _ = ok_tx.send(Ok(())); v }, Err(e) => { let _ = ok_tx.send(Err(e)); return; }, }; @@ -288,18 +347,25 @@ impl LlamacppBackend { // health() returns true let _ = status_tx.send(true); - while let Some(request) = rx.blocking_recv() { - debug!("Request: {:?}", request); - - let start_time = Instant::now(); + while let Ok(requests) = sync_rx.recv() { // TODO: do a real batch - let mut batch = unsafe { - bindings::llama_batch_get_one( - request.input_ids.as_ptr() as _, - request.input_ids.len() as _, - ) - }; + for (_seq_id, request) in requests.iter().enumerate() { + + debug!("Request: {:?}", request); + let start_time = Instant::now(); + llamacpp.batch.n_tokens = 0; + + for (pos, &token_id) in request.input_ids.iter().enumerate() { + llamacpp.batch_push( + token_id as bindings::llama_token, + pos as bindings::llama_pos, + &[/* seq_id */ 0 as bindings::llama_seq_id], + true, + ); + } + // TODO: close this loop :) + // TODO: move up for perf ? 
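+                    // A fresh sampler chain is built per request from its sampling parameters.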
let sampler = match LlamacppSampler::new(&request) { Some(sampler) => sampler, @@ -310,10 +376,10 @@ impl LlamacppBackend { }; let mut text = String::with_capacity(1024); let mut n_tokens: usize = 0; + let mut n_new_tokens: usize = 0; loop { - debug!(?batch); - match unsafe { bindings::llama_decode(llamacpp.ctx, batch) } { + match unsafe { bindings::llama_decode(llamacpp.ctx, llamacpp.batch) } { 0 => { }, 1 => { unsafe { @@ -321,7 +387,7 @@ impl LlamacppBackend { bindings::llama_kv_cache_clear(llamacpp.ctx); } let _ = request.tx.send(Err(InferError::IncompleteGeneration)); - continue; + break; }, _ => { debug!("decode return <0"); @@ -329,9 +395,11 @@ impl LlamacppBackend { break; }, }; - let mut next = sampler.sample(&llamacpp); - n_tokens += 1; - debug!(?n_tokens); + let next = sampler.sample(&llamacpp); + n_tokens += llamacpp.batch.n_tokens as usize; + n_new_tokens += llamacpp.batch.n_tokens as usize; + + debug!("tokens: {n_tokens} new: {n_new_tokens}"); let logits = unsafe { *bindings::llama_get_logits_ith(llamacpp.ctx, -1) @@ -361,7 +429,7 @@ impl LlamacppBackend { let finish: Option = { if unsafe { bindings::llama_vocab_is_eog(llamacpp.vocab, next) } { Some(FinishReason::EndOfSequenceToken) - } else if n_tokens == request.max_new_tokens { + } else if n_new_tokens == request.max_new_tokens { Some(FinishReason::Length) } else if kv_cache_used_cells == llamacpp.n_ctx as i32 { Some(FinishReason::Length) // TODO: check @@ -375,7 +443,7 @@ impl LlamacppBackend { top_tokens: vec![], generated_text: GeneratedText { text: text, - generated_tokens: n_tokens as _, + generated_tokens: n_new_tokens as _, finish_reason: reason, seed: Some(request.seed as _), }, @@ -388,11 +456,11 @@ impl LlamacppBackend { token: token, top_tokens: vec![], })); - batch = unsafe { - bindings::llama_batch_get_one(&mut next, 1) - }; + llamacpp.batch.n_tokens = 0; + llamacpp.batch_push(next, n_tokens as _, &[0], true); } } + } // TODO remove this }); (Self{tx, status: status_rx}, ok_rx) } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 00d84cebaa4..800792e5a2f 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -161,6 +161,8 @@ async fn main() -> Result<(), RouterError> { use_mmap: args.use_mmap, use_mlock: args.use_mlock, flash_attention: args.flash_attention, + batch_size: 5, + batch_timeout: tokio::time::Duration::from_millis(100), }, tokenizer, ); From 3eb4823f3e88c5eb803d1392fc61ec5a4fac4b83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 30 Jan 2025 15:12:55 +0000 Subject: [PATCH 03/63] Use max_batch_total_tokens MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 14 +++++++++----- backends/llamacpp/src/main.rs | 23 +++++++++++++---------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 80c04bc326f..887267568b8 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -24,7 +24,7 @@ use tracing::{instrument}; pub struct LlamacppConfig { pub model_gguf: String, pub n_ctx: u32, - pub batch_size: usize, + pub max_batch_total_tokens: u32, pub batch_timeout: Duration, pub n_threads: i32, pub use_mmap: bool, @@ -142,7 +142,7 @@ impl Llamacpp { return Err(BackendError::Llamacpp("Failed to get vocab".to_string())); } let batch = unsafe { - bindings::llama_batch_init(4096, 0, 5) + 
bindings::llama_batch_init(conf.max_batch_total_tokens as _, 0, 1) }; // TODO check batch Ok(Llamacpp{model, ctx, vocab, n_ctx, batch}) @@ -313,21 +313,25 @@ impl LlamacppBackend { let (sync_tx, sync_rx) = mpsc::channel(); spawn(async move { + let mut n_tokens = 0; let mut requests = Vec::new(); loop { match timeout(conf.batch_timeout, rx.recv()).await { Ok(None) => break, // closed Ok(Some(request)) => { - requests.push(request); - if requests.len() >= conf.batch_size { + if n_tokens + request.input_ids.len() > conf.max_batch_total_tokens as usize { let _ = sync_tx.send(requests); - requests = Vec::new(); + n_tokens = request.input_ids.len(); + requests = vec![request]; + } else { + requests.push(request); } }, Err(_) => { if !requests.is_empty() { let _ = sync_tx.send(requests); + n_tokens = 0; requests = Vec::new(); } } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 800792e5a2f..ea6743a77d6 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -68,8 +68,11 @@ struct Args { // waiting_served_ratio: f32, // #[clap(default_value = "4096", long, env)] // max_batch_prefill_tokens: u32, -// #[clap(long, env)] -// max_batch_total_tokens: Option, + + /// Maximum tokens within a batch + #[clap(default_value = "1024", long, env)] + max_batch_total_tokens: u32, + // #[clap(default_value = "20", long, env)] // max_waiting_tokens: usize, // #[clap(long, env)] @@ -155,14 +158,14 @@ async fn main() -> Result<(), RouterError> { let (backend, ok) = LlamacppBackend::new( LlamacppConfig { - model_gguf: args.model_gguf, - n_ctx: args.n_ctx, - n_threads: args.n_threads, - use_mmap: args.use_mmap, - use_mlock: args.use_mlock, - flash_attention: args.flash_attention, - batch_size: 5, - batch_timeout: tokio::time::Duration::from_millis(100), + model_gguf: args.model_gguf, + n_ctx: args.n_ctx, + n_threads: args.n_threads, + use_mmap: args.use_mmap, + use_mlock: args.use_mlock, + flash_attention: args.flash_attention, + max_batch_total_tokens: args.max_batch_total_tokens, + batch_timeout: tokio::time::Duration::from_millis(100), }, tokenizer, ); From e7facf692f9a73e00743f9e92f10e2b3c58a47e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 30 Jan 2025 19:50:09 +0000 Subject: [PATCH 04/63] Handle max_batch_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 14 ++++++++++++-- backends/llamacpp/src/main.rs | 7 +++++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 887267568b8..38b21ce2377 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -25,6 +25,7 @@ pub struct LlamacppConfig { pub model_gguf: String, pub n_ctx: u32, pub max_batch_total_tokens: u32, + pub max_batch_size: Option, pub batch_timeout: Duration, pub n_threads: i32, pub use_mmap: bool, @@ -320,13 +321,22 @@ impl LlamacppBackend { match timeout(conf.batch_timeout, rx.recv()).await { Ok(None) => break, // closed Ok(Some(request)) => { + if let Some(max_batch_size) = conf.max_batch_size { + if requests.len() + 1 == max_batch_size { + requests.push(request); + let _ = sync_tx.send(requests); + n_tokens = 0; + requests = Vec::new(); + continue; + } + } if n_tokens + request.input_ids.len() > conf.max_batch_total_tokens as usize { let _ = sync_tx.send(requests); n_tokens = request.input_ids.len(); requests = vec![request]; - } else { 
- requests.push(request); + continue; } + requests.push(request); }, Err(_) => { if !requests.is_empty() { diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index ea6743a77d6..a9bebd888f7 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -75,8 +75,10 @@ struct Args { // #[clap(default_value = "20", long, env)] // max_waiting_tokens: usize, -// #[clap(long, env)] -// max_batch_size: Option, + + /// Maximum number of requests per batch + #[clap(long, env)] + max_batch_size: Option, /// The IP address to listen on #[clap(default_value = "0.0.0.0", long, env)] @@ -165,6 +167,7 @@ async fn main() -> Result<(), RouterError> { use_mlock: args.use_mlock, flash_attention: args.flash_attention, max_batch_total_tokens: args.max_batch_total_tokens, + max_batch_size: args.max_batch_size, batch_timeout: tokio::time::Duration::from_millis(100), }, tokenizer, From a7b4b04cb58b2697b5f6e935de74dae04704dce0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 30 Jan 2025 20:21:37 +0000 Subject: [PATCH 05/63] Add some input validation checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 12 ++++++------ backends/llamacpp/src/main.rs | 25 +++++++++++++++++++++---- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 38b21ce2377..85976779509 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -23,11 +23,11 @@ use tracing::{instrument}; pub struct LlamacppConfig { pub model_gguf: String, - pub n_ctx: u32, - pub max_batch_total_tokens: u32, + pub n_ctx: usize, + pub max_batch_total_tokens: usize, pub max_batch_size: Option, pub batch_timeout: Duration, - pub n_threads: i32, + pub n_threads: isize, pub use_mmap: bool, pub use_mlock: bool, pub flash_attention: bool, @@ -124,9 +124,9 @@ impl Llamacpp { } let ctx = unsafe { let mut params = bindings::llama_context_default_params(); - params.n_ctx = conf.n_ctx; - params.n_threads = conf.n_threads; - params.n_threads_batch = conf.n_threads; + params.n_ctx = conf.n_ctx as _; + params.n_threads = conf.n_threads as _; + params.n_threads_batch = conf.n_threads as _; params.flash_attn = conf.flash_attention; params.no_perf = true; bindings::llama_init_from_model(model, params) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index a9bebd888f7..33614059368 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -26,11 +26,11 @@ struct Args { /// Context size for the model. #[clap(default_value = "4096", long, env)] - n_ctx: u32, + n_ctx: usize, /// Number of threads to use for inference. #[clap(default_value = "1", long, env)] - n_threads: i32, + n_threads: isize, #[clap(default_value = "true", long, env)] /// Whether to use memory mapping. 
@@ -70,8 +70,8 @@ struct Args { // max_batch_prefill_tokens: u32, /// Maximum tokens within a batch - #[clap(default_value = "1024", long, env)] - max_batch_total_tokens: u32, + #[clap(default_value = "4096", long, env)] + max_batch_total_tokens: usize, // #[clap(default_value = "20", long, env)] // max_waiting_tokens: usize, @@ -140,6 +140,23 @@ async fn main() -> Result<(), RouterError> { "`max_input_tokens` must be < `max_total_tokens`".to_string(), )); } + if args.max_total_tokens > args.max_batch_total_tokens { + return Err(RouterError::ArgumentValidation( + "`max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), + )); + } + if let Some(max_batch_size) = args.max_batch_size { + if max_batch_size * args.max_total_tokens > args.max_batch_total_tokens { + return Err(RouterError::ArgumentValidation( + "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), + )); + } + } + if args.max_batch_total_tokens > args.n_ctx { + return Err(RouterError::ArgumentValidation( + "`max_batch_total_tokens` must be <= `n_ctx`".to_string(), + )); + } // TODO: check if we use the same cache of Server // check if llamacpp is faster From 8d2dfdf668ae8f5c0d6ad241ee8e4edde1b85aba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 30 Jan 2025 22:41:26 +0000 Subject: [PATCH 06/63] Handle ctx args & fix sampling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Dockerfile_llamacpp | 2 +- backends/llamacpp/src/backend.rs | 101 ++++++++++++++++++++++--------- backends/llamacpp/src/main.rs | 16 +++-- 3 files changed, 80 insertions(+), 39 deletions(-) diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 7f083ec20cb..006a320412f 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -18,7 +18,7 @@ RUN apt-get install -y \ # nvidia-cuda-toolkit # -DGGML_CUDA=ON \ -ENV LLAMA_VERSION=b4585 +ENV LLAMA_VERSION=b4599 RUN git clone --depth 1 -b ${LLAMA_VERSION} https://github.com/ggerganov/llama.cpp \ && cd llama.cpp \ && cmake -B build \ diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 85976779509..0d3c3950952 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -25,9 +25,9 @@ pub struct LlamacppConfig { pub model_gguf: String, pub n_ctx: usize, pub max_batch_total_tokens: usize, - pub max_batch_size: Option, + pub max_batch_size: usize, pub batch_timeout: Duration, - pub n_threads: isize, + pub n_threads: usize, pub use_mmap: bool, pub use_mlock: bool, pub flash_attention: bool, @@ -88,6 +88,7 @@ struct Llamacpp { model: *mut bindings::llama_model, ctx: *mut bindings::llama_context, vocab: *const bindings::llama_vocab, + logprobs: Vec, batch: bindings::llama_batch, n_ctx: u32, } @@ -115,7 +116,7 @@ impl Llamacpp { let model = unsafe { let mut params = bindings::llama_model_default_params(); - params.use_mmap = conf.use_mmap; + params.use_mmap = conf.use_mmap; params.use_mlock = conf.use_mlock; bindings::llama_model_load_from_file(gguf.as_ptr(), params) }; @@ -124,11 +125,14 @@ impl Llamacpp { } let ctx = unsafe { let mut params = bindings::llama_context_default_params(); - params.n_ctx = conf.n_ctx as _; - params.n_threads = conf.n_threads as _; - params.n_threads_batch = conf.n_threads as _; - params.flash_attn = conf.flash_attention; - params.no_perf = true; + params.n_ctx = conf.n_ctx as _; + params.n_batch = conf.max_batch_total_tokens as _; + params.n_ubatch = conf.max_batch_total_tokens 
as _; // TODO ? + params.n_seq_max = conf.max_batch_size as _; + params.n_threads = conf.n_threads as _; + params.n_threads_batch = conf.n_threads as _; // TODO ? + params.flash_attn = conf.flash_attention; + params.no_perf = true; bindings::llama_init_from_model(model, params) }; if ctx.is_null() { @@ -142,11 +146,30 @@ impl Llamacpp { if vocab.is_null() { return Err(BackendError::Llamacpp("Failed to get vocab".to_string())); } + let n_tokens = unsafe { + bindings::llama_vocab_n_tokens(vocab) + }; + let mut logprobs = Vec::with_capacity(n_tokens as usize); + + for token in 0..n_tokens { + logprobs.push(bindings::llama_token_data { + id: token, + logit: 0.0, + p: 0.0, + }); + } let batch = unsafe { bindings::llama_batch_init(conf.max_batch_total_tokens as _, 0, 1) }; - // TODO check batch - Ok(Llamacpp{model, ctx, vocab, n_ctx, batch}) + Ok(Llamacpp{model, ctx, vocab, logprobs, n_ctx, batch}) + } + + fn batch_clear_logits(&mut self) { + for n in 0..self.batch.n_tokens as usize{ + unsafe { + *self.batch.logits.add(n) = 0 as i8; + } + } } fn batch_push( @@ -156,6 +179,7 @@ impl Llamacpp { seq_ids: &[bindings::llama_seq_id], logits: bool, ) { + debug!("push {token} {pos} {logits}"); // TODO check evertyhing.. let n = self.batch.n_tokens as usize; @@ -279,9 +303,29 @@ impl LlamacppSampler { } } - fn sample(&self, llamacpp: &Llamacpp) -> bindings::llama_token { - // use apply/accept ? - unsafe { bindings::llama_sampler_sample(self.chain, llamacpp.ctx, -1) }// -1 ? + fn sample(&self, llamacpp: &mut Llamacpp, idx: usize) -> (bindings::llama_token, f32) { + let logits = unsafe { + bindings::llama_get_logits_ith(llamacpp.ctx, idx as _) + }; + for (token, logprob) in llamacpp.logprobs.iter_mut().enumerate() { + *logprob = bindings::llama_token_data { + id: token as _, + logit: unsafe { *logits.offset(token as _) }, + p: 0.0, + }; + } + let mut view = bindings::llama_token_data_array { + data: llamacpp.logprobs.as_mut_ptr(), + size: llamacpp.logprobs.len(), + selected: -1, + sorted: false, + }; + unsafe { + bindings::llama_sampler_apply(self.chain, &mut view); + let logprob = *view.data.offset(view.selected as _); + bindings::llama_sampler_accept(self.chain, logprob.id); + (logprob.id, logprob.logit) // maybe p.ln() ? + } } } @@ -321,14 +365,12 @@ impl LlamacppBackend { match timeout(conf.batch_timeout, rx.recv()).await { Ok(None) => break, // closed Ok(Some(request)) => { - if let Some(max_batch_size) = conf.max_batch_size { - if requests.len() + 1 == max_batch_size { - requests.push(request); - let _ = sync_tx.send(requests); - n_tokens = 0; - requests = Vec::new(); - continue; - } + if requests.len() + 1 == conf.max_batch_size { + requests.push(request); + let _ = sync_tx.send(requests); + n_tokens = 0; + requests = Vec::new(); + continue; } if n_tokens + request.input_ids.len() > conf.max_batch_total_tokens as usize { let _ = sync_tx.send(requests); @@ -378,6 +420,8 @@ impl LlamacppBackend { true, ); } + let mut pos = request.input_ids.len(); + // TODO: close this loop :) // TODO: move up for perf ? 
@@ -409,15 +453,12 @@ impl LlamacppBackend { break; }, }; - let next = sampler.sample(&llamacpp); - n_tokens += llamacpp.batch.n_tokens as usize; - n_new_tokens += llamacpp.batch.n_tokens as usize; + let idx = llamacpp.batch.n_tokens as usize - 1; + let (next, logprob) = sampler.sample(&mut llamacpp, idx); + n_new_tokens += 1; debug!("tokens: {n_tokens} new: {n_new_tokens}"); - let logits = unsafe { - *bindings::llama_get_logits_ith(llamacpp.ctx, -1) - }; let kv_cache_used_cells = unsafe { bindings::llama_get_kv_cache_used_cells(llamacpp.ctx) }; @@ -437,7 +478,7 @@ impl LlamacppBackend { let token = Token { id: next as _, text: piece, - logprob: logits as _, + logprob: logprob, special: special, }; let finish: Option = { @@ -471,7 +512,9 @@ impl LlamacppBackend { top_tokens: vec![], })); llamacpp.batch.n_tokens = 0; - llamacpp.batch_push(next, n_tokens as _, &[0], true); + // llamacpp.batch_clear_logits(); + llamacpp.batch_push(next, pos as _, &[0], true); + pos += 1; } } } // TODO remove this diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 33614059368..7eae8315833 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -30,7 +30,7 @@ struct Args { /// Number of threads to use for inference. #[clap(default_value = "1", long, env)] - n_threads: isize, + n_threads: usize, #[clap(default_value = "true", long, env)] /// Whether to use memory mapping. @@ -77,8 +77,8 @@ struct Args { // max_waiting_tokens: usize, /// Maximum number of requests per batch - #[clap(long, env)] - max_batch_size: Option, + #[clap(default_value = "1", long, env)] + max_batch_size: usize, /// The IP address to listen on #[clap(default_value = "0.0.0.0", long, env)] @@ -145,12 +145,10 @@ async fn main() -> Result<(), RouterError> { "`max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), )); } - if let Some(max_batch_size) = args.max_batch_size { - if max_batch_size * args.max_total_tokens > args.max_batch_total_tokens { - return Err(RouterError::ArgumentValidation( - "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), - )); - } + if args.max_batch_size * args.max_total_tokens > args.max_batch_total_tokens { + return Err(RouterError::ArgumentValidation( + "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), + )); } if args.max_batch_total_tokens > args.n_ctx { return Err(RouterError::ArgumentValidation( From f38874798571c7e7d4ad0279353913ec39c9e65a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 31 Jan 2025 09:50:57 +0000 Subject: [PATCH 07/63] Add GPU args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 36 ++++++++++++++++++++++++++++++++ backends/llamacpp/src/main.rs | 12 ++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 0d3c3950952..d7bc31de3f3 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -20,6 +20,28 @@ use tokio::time::{Duration, Instant, timeout}; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, info, warn, error, trace}; use tracing::{instrument}; +use std::str::FromStr; + +#[derive(Debug, Clone, Copy)] +pub enum LlamacppSplitMode { + GPU(usize), + Layer, + Row, +} + +impl FromStr for LlamacppSplitMode { + type Err = String; + fn from_str(s: &str) -> Result { + match 
s.to_lowercase().as_str() { + "layer" => Ok(LlamacppSplitMode::Layer), + "row" => Ok(LlamacppSplitMode::Row), + _ => match s.parse::() { + Ok(n) => Ok(LlamacppSplitMode::GPU(n)), + Err(_) => Err(format!("Choose a GPU number or `layer` or `row`")), + } + } + } +} pub struct LlamacppConfig { pub model_gguf: String, @@ -28,6 +50,8 @@ pub struct LlamacppConfig { pub max_batch_size: usize, pub batch_timeout: Duration, pub n_threads: usize, + pub n_gpu_layers: usize, + pub split_mode: LlamacppSplitMode, pub use_mmap: bool, pub use_mlock: bool, pub flash_attention: bool, @@ -116,6 +140,18 @@ impl Llamacpp { let model = unsafe { let mut params = bindings::llama_model_default_params(); + params.n_gpu_layers = conf.n_gpu_layers as _; + params.split_mode = match conf.split_mode { + LlamacppSplitMode::GPU(_) => bindings::LLAMA_SPLIT_MODE_NONE, + LlamacppSplitMode::Layer => bindings::LLAMA_SPLIT_MODE_LAYER, + LlamacppSplitMode::Row => bindings::LLAMA_SPLIT_MODE_ROW, + }; + params.main_gpu = match conf.split_mode { + LlamacppSplitMode::GPU(n) => n as _, + _ => 0, + }; + info!(?params.split_mode); + info!(?params.main_gpu); params.use_mmap = conf.use_mmap; params.use_mlock = conf.use_mlock; bindings::llama_model_load_from_file(gguf.as_ptr(), params) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 7eae8315833..fe7c1cd1b36 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,6 +1,6 @@ mod backend; -use backend::{LlamacppConfig, LlamacppBackend, BackendError}; +use backend::{LlamacppSplitMode, LlamacppConfig, LlamacppBackend, BackendError}; use clap::{Parser}; use text_generation_router::{logging, server, usage_stats}; use thiserror::Error; @@ -32,6 +32,14 @@ struct Args { #[clap(default_value = "1", long, env)] n_threads: usize, + /// Number of layers to store in VRAM. + #[clap(default_value = "0", long, env)] + n_gpu_layers: usize, + + /// Split the model across multiple GPUs. + #[clap(default_value = "Layer", value_enum, long, env)] + split_mode: LlamacppSplitMode, + #[clap(default_value = "true", long, env)] /// Whether to use memory mapping. use_mmap: bool, @@ -178,6 +186,8 @@ async fn main() -> Result<(), RouterError> { model_gguf: args.model_gguf, n_ctx: args.n_ctx, n_threads: args.n_threads, + n_gpu_layers: args.n_gpu_layers, + split_mode: args.split_mode, use_mmap: args.use_mmap, use_mlock: args.use_mlock, flash_attention: args.flash_attention, From e07835c5b5c137541e3e6a74abc8be6651a0d4e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 31 Jan 2025 10:38:34 +0000 Subject: [PATCH 08/63] Add --defrag-threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 2 ++ backends/llamacpp/src/main.rs | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index d7bc31de3f3..53f2c098375 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -52,6 +52,7 @@ pub struct LlamacppConfig { pub n_threads: usize, pub n_gpu_layers: usize, pub split_mode: LlamacppSplitMode, + pub defrag_threshold: f32, pub use_mmap: bool, pub use_mlock: bool, pub flash_attention: bool, @@ -167,6 +168,7 @@ impl Llamacpp { params.n_seq_max = conf.max_batch_size as _; params.n_threads = conf.n_threads as _; params.n_threads_batch = conf.n_threads as _; // TODO ? 
+ params.defrag_thold = conf.defrag_threshold; params.flash_attn = conf.flash_attention; params.no_perf = true; bindings::llama_init_from_model(model, params) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index fe7c1cd1b36..53a83aa1d6c 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -40,6 +40,10 @@ struct Args { #[clap(default_value = "Layer", value_enum, long, env)] split_mode: LlamacppSplitMode, + /// Defragment the KV cache if holes/size > threshold. + #[clap(default_value = "-1.0", long, env)] + defrag_threshold: f32, + #[clap(default_value = "true", long, env)] /// Whether to use memory mapping. use_mmap: bool, @@ -188,6 +192,7 @@ async fn main() -> Result<(), RouterError> { n_threads: args.n_threads, n_gpu_layers: args.n_gpu_layers, split_mode: args.split_mode, + defrag_threshold: args.defrag_threshold, use_mmap: args.use_mmap, use_mlock: args.use_mlock, flash_attention: args.flash_attention, From d6ded897a888373500040e0f013247bca3081b42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 31 Jan 2025 12:44:09 +0000 Subject: [PATCH 09/63] Add a stupid batch mechanism MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 223 ++++++++++++++++--------------- backends/llamacpp/src/main.rs | 2 +- 2 files changed, 119 insertions(+), 106 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 53f2c098375..ba5ca186786 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -202,7 +202,7 @@ impl Llamacpp { Ok(Llamacpp{model, ctx, vocab, logprobs, n_ctx, batch}) } - fn batch_clear_logits(&mut self) { + fn _batch_clear_logits(&mut self) { for n in 0..self.batch.n_tokens as usize{ unsafe { *self.batch.logits.add(n) = 0 as i8; @@ -214,24 +214,15 @@ impl Llamacpp { &mut self, token: bindings::llama_token, pos: bindings::llama_pos, - seq_ids: &[bindings::llama_seq_id], + seq_id: bindings::llama_seq_id, logits: bool, ) { - debug!("push {token} {pos} {logits}"); - // TODO check evertyhing.. 
let n = self.batch.n_tokens as usize; - unsafe { *self.batch.token.add(n) = token; *self.batch.pos.add(n) = pos; - *self.batch.n_seq_id.add(n) = seq_ids.len() as i32; - } - for (i, &seq_id) in seq_ids.iter().enumerate() { - unsafe { - *(*self.batch.seq_id.add(n)).add(i) = seq_id; - } - } - unsafe { + *self.batch.n_seq_id.add(n) = 1; + *(*self.batch.seq_id.add(n)).add(0) = seq_id; *self.batch.logits.add(n) = logits as i8; } self.batch.n_tokens += 1; @@ -375,6 +366,17 @@ impl Drop for LlamacppSampler { } } +struct LlamacppSeq { + id: usize, + batch_pos: usize, + token: bindings::llama_token, + pos: bindings::llama_pos, + sampler: LlamacppSampler, + text: String, + n_new_tokens: usize, + running: bool, +} + static INIT: Once = Once::new(); impl LlamacppBackend { @@ -397,7 +399,7 @@ impl LlamacppBackend { spawn(async move { let mut n_tokens = 0; - let mut requests = Vec::new(); + let mut requests = Vec::with_capacity(conf.max_batch_size); loop { match timeout(conf.batch_timeout, rx.recv()).await { @@ -442,120 +444,131 @@ impl LlamacppBackend { let _ = status_tx.send(true); while let Ok(requests) = sync_rx.recv() { + let start_time = Instant::now(); + let mut seqs: Vec = Vec::with_capacity(requests.len()); + llamacpp.batch.n_tokens = 0; - // TODO: do a real batch - for (_seq_id, request) in requests.iter().enumerate() { - + for (seq_id, request) in requests.iter().enumerate() { debug!("Request: {:?}", request); - let start_time = Instant::now(); - llamacpp.batch.n_tokens = 0; - + let sampler = match LlamacppSampler::new(&request) { + Some(sampler) => sampler, + _ => { + let _ = request.tx.send(Err(InferError::IncompleteGeneration)); + continue; + }, + }; for (pos, &token_id) in request.input_ids.iter().enumerate() { llamacpp.batch_push( token_id as bindings::llama_token, pos as bindings::llama_pos, - &[/* seq_id */ 0 as bindings::llama_seq_id], - true, + seq_id as bindings::llama_seq_id, + true, // TODO ); } - let mut pos = request.input_ids.len(); - - // TODO: close this loop :) - - // TODO: move up for perf ? 
- let sampler = match LlamacppSampler::new(&request) { - Some(sampler) => sampler, - _ => { - let _ = request.tx.send(Err(InferError::IncompleteGeneration)); - continue; - }, - }; - let mut text = String::with_capacity(1024); - let mut n_tokens: usize = 0; - let mut n_new_tokens: usize = 0; - + seqs.push(LlamacppSeq { + id: seq_id, + batch_pos: llamacpp.batch.n_tokens as usize - 1, + token: -1, + pos: request.input_ids.len() as _, + sampler: sampler, + text: String::with_capacity(1024), + n_new_tokens: 0, + running: true, + }); + } loop { - match unsafe { bindings::llama_decode(llamacpp.ctx, llamacpp.batch) } { - 0 => { }, - 1 => { - unsafe { - // TODO: seq_rm & seq_add if model is compatible - bindings::llama_kv_cache_clear(llamacpp.ctx); - } - let _ = request.tx.send(Err(InferError::IncompleteGeneration)); - break; - }, - _ => { - debug!("decode return <0"); - let _ = request.tx.send(Err(InferError::IncompleteGeneration)); - break; - }, + if llamacpp.batch.n_tokens == 0 { + break; + } + let decode = unsafe { + bindings::llama_decode(llamacpp.ctx, llamacpp.batch) }; - let idx = llamacpp.batch.n_tokens as usize - 1; - let (next, logprob) = sampler.sample(&mut llamacpp, idx); - n_new_tokens += 1; - - debug!("tokens: {n_tokens} new: {n_new_tokens}"); + if decode != 0 { + error!("Failed to decode batch: {decode}"); + if decode == 1 { + unsafe { + bindings::llama_kv_cache_clear(llamacpp.ctx); // TODO + } + } + for seq in seqs.iter_mut() { + let _ = requests[seq.id].tx.send(Err(InferError::IncompleteGeneration)); + seq.running = false; + } + break; + } let kv_cache_used_cells = unsafe { bindings::llama_get_kv_cache_used_cells(llamacpp.ctx) }; - let piece = match tokenizer.decode(&[next as u32], false) { - Ok(piece) => piece, - Err(e) => { - error!("Failed to decode token: {e}"); - let _ = request.tx.send(Err(InferError::IncompleteGeneration)); - break; - }, - }; - let special = vocab.is_special_token(&piece); + for seq in seqs.iter_mut() { + let (next, logprob) = seq.sampler.sample(&mut llamacpp, seq.batch_pos); + seq.n_new_tokens += 1; + seq.token = next; + + let piece = match tokenizer.decode(&[next as u32], false) { + Ok(piece) => piece, + Err(e) => { + error!("Failed to decode token: {e}"); + let _ = requests[seq.id].tx.send(Err(InferError::IncompleteGeneration)); + seq.running = false; + break; + }, + }; + let special = vocab.is_special_token(&piece); - if !special { - text.push_str(&piece); - } - let token = Token { - id: next as _, - text: piece, - logprob: logprob, - special: special, - }; - let finish: Option = { - if unsafe { bindings::llama_vocab_is_eog(llamacpp.vocab, next) } { - Some(FinishReason::EndOfSequenceToken) - } else if n_new_tokens == request.max_new_tokens { - Some(FinishReason::Length) - } else if kv_cache_used_cells == llamacpp.n_ctx as i32 { - Some(FinishReason::Length) // TODO: check - } else { - None + if !special { + seq.text.push_str(&piece); } - }; - if let Some(reason) = finish { - let _ = request.tx.send(Ok(InferStreamResponse::End { + let token = Token { + id: next as _, + text: piece, + logprob: logprob, + special: special, + }; + let finish: Option = { + if unsafe { bindings::llama_vocab_is_eog(llamacpp.vocab, next) } { + Some(FinishReason::EndOfSequenceToken) + } else if seq.n_new_tokens == requests[seq.id].max_new_tokens { + Some(FinishReason::Length) + } else if kv_cache_used_cells == llamacpp.n_ctx as i32 { + Some(FinishReason::Length) // TODO: check + } else { + None + } + }; + if let Some(reason) = finish { + let _ = 
requests[seq.id].tx.send(Ok(InferStreamResponse::End { + token: token, + top_tokens: vec![], + generated_text: GeneratedText { + text: seq.text.clone(), + generated_tokens: seq.n_new_tokens as _, + finish_reason: reason, + seed: Some(requests[seq.id].seed as _), + }, + start: start_time, + queued: requests[seq.id].time, + })); + seq.running = false; + break; + } + let _ = requests[seq.id].tx.send(Ok(InferStreamResponse::Intermediate { token: token, top_tokens: vec![], - generated_text: GeneratedText { - text: text, - generated_tokens: n_new_tokens as _, - finish_reason: reason, - seed: Some(request.seed as _), - }, - start: start_time, - queued: request.time, })); - break; } - let _ = request.tx.send(Ok(InferStreamResponse::Intermediate { - token: token, - top_tokens: vec![], - })); + // generate a new batch llamacpp.batch.n_tokens = 0; - // llamacpp.batch_clear_logits(); - llamacpp.batch_push(next, pos as _, &[0], true); - pos += 1; + + for seq in seqs.iter_mut() { + if seq.running { + llamacpp.batch_push(seq.token, seq.pos, seq.id as _, true); + seq.batch_pos = 0; + seq.pos += 1; + } + } } } - } // TODO remove this }); (Self{tx, status: status_rx}, ok_rx) } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 53a83aa1d6c..e1edd72dfff 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -198,7 +198,7 @@ async fn main() -> Result<(), RouterError> { flash_attention: args.flash_attention, max_batch_total_tokens: args.max_batch_total_tokens, max_batch_size: args.max_batch_size, - batch_timeout: tokio::time::Duration::from_millis(100), + batch_timeout: tokio::time::Duration::from_millis(5), }, tokenizer, ); From 390f0ec06159aacdcfb88c8437e0ce9db98685fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 31 Jan 2025 15:00:23 +0000 Subject: [PATCH 10/63] Cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 2 -- backends/llamacpp/src/main.rs | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index ba5ca186786..bf45a67f472 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -151,8 +151,6 @@ impl Llamacpp { LlamacppSplitMode::GPU(n) => n as _, _ => 0, }; - info!(?params.split_mode); - info!(?params.main_gpu); params.use_mmap = conf.use_mmap; params.use_mlock = conf.use_mlock; bindings::llama_model_load_from_file(gguf.as_ptr(), params) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index e1edd72dfff..5fb23d1707c 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -37,19 +37,19 @@ struct Args { n_gpu_layers: usize, /// Split the model across multiple GPUs. - #[clap(default_value = "Layer", value_enum, long, env)] + #[clap(default_value = "Layer", long, env)] split_mode: LlamacppSplitMode, /// Defragment the KV cache if holes/size > threshold. #[clap(default_value = "-1.0", long, env)] defrag_threshold: f32, - #[clap(default_value = "true", long, env)] /// Whether to use memory mapping. + #[clap(default_value = "true", long, env)] use_mmap: bool, - #[clap(default_value = "false", long, env)] /// Whether to use memory locking. + #[clap(default_value = "false", long, env)] use_mlock: bool, /// Enable flash attention for faster inference. 
(EXPERIMENTAL) From 7a3ed4171e1a10da3a1012397bae0d4a92e660af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 31 Jan 2025 15:09:29 +0000 Subject: [PATCH 11/63] Add --numa MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 18 +++++++++++++++++- backends/llamacpp/src/main.rs | 7 ++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index bf45a67f472..63a937574d8 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -43,6 +43,15 @@ impl FromStr for LlamacppSplitMode { } } +#[derive(Debug, Clone, Copy, clap::ValueEnum)] +pub enum LlamacppNuma { + Disabled, + Distribute, + Isolate, + Numactl, + Mirror, +} + pub struct LlamacppConfig { pub model_gguf: String, pub n_ctx: usize, @@ -52,6 +61,7 @@ pub struct LlamacppConfig { pub n_threads: usize, pub n_gpu_layers: usize, pub split_mode: LlamacppSplitMode, + pub numa: LlamacppNuma, pub defrag_threshold: f32, pub use_mmap: bool, pub use_mlock: bool, @@ -387,7 +397,13 @@ impl LlamacppBackend { INIT.call_once(|| unsafe { bindings::llama_log_set(Some(llamacpp_log_callback), std::ptr::null_mut()); bindings::llama_backend_init(); - bindings::llama_numa_init(bindings::GGML_NUMA_STRATEGY_NUMACTL); // TODO add option & test + bindings::llama_numa_init(match conf.numa { + LlamacppNuma::Disabled => bindings::GGML_NUMA_STRATEGY_DISABLED, + LlamacppNuma::Distribute => bindings::GGML_NUMA_STRATEGY_DISTRIBUTE, + LlamacppNuma::Isolate => bindings::GGML_NUMA_STRATEGY_ISOLATE, + LlamacppNuma::Numactl => bindings::GGML_NUMA_STRATEGY_NUMACTL, + LlamacppNuma::Mirror => bindings::GGML_NUMA_STRATEGY_MIRROR, + }); }); let (status_tx, status_rx) = watch::channel(false); diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 5fb23d1707c..a8283c132b0 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,6 +1,6 @@ mod backend; -use backend::{LlamacppSplitMode, LlamacppConfig, LlamacppBackend, BackendError}; +use backend::{LlamacppNuma, LlamacppSplitMode, LlamacppConfig, LlamacppBackend, BackendError}; use clap::{Parser}; use text_generation_router::{logging, server, usage_stats}; use thiserror::Error; @@ -44,6 +44,10 @@ struct Args { #[clap(default_value = "-1.0", long, env)] defrag_threshold: f32, + /// Setup NUMA optimizations. + #[clap(default_value = "Disabled", value_enum, long, env)] + numa: LlamacppNuma, + /// Whether to use memory mapping. 
#[clap(default_value = "true", long, env)] use_mmap: bool, @@ -193,6 +197,7 @@ async fn main() -> Result<(), RouterError> { n_gpu_layers: args.n_gpu_layers, split_mode: args.split_mode, defrag_threshold: args.defrag_threshold, + numa: args.numa, use_mmap: args.use_mmap, use_mlock: args.use_mlock, flash_attention: args.flash_attention, From 3f199134f025ff69a84d6776612c6dd143211946 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 31 Jan 2025 15:51:28 +0000 Subject: [PATCH 12/63] Fix args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index a8283c132b0..4afa64e9079 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -37,7 +37,7 @@ struct Args { n_gpu_layers: usize, /// Split the model across multiple GPUs. - #[clap(default_value = "Layer", long, env)] + #[clap(default_value = "layer", long, env)] split_mode: LlamacppSplitMode, /// Defragment the KV cache if holes/size > threshold. @@ -45,7 +45,7 @@ struct Args { defrag_threshold: f32, /// Setup NUMA optimizations. - #[clap(default_value = "Disabled", value_enum, long, env)] + #[clap(default_value = "disabled", value_enum, long, env)] numa: LlamacppNuma, /// Whether to use memory mapping. From ae5bb789c29410a8e6376349eeca88fcd1c60615 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 31 Jan 2025 16:07:10 +0000 Subject: [PATCH 13/63] Enable flash attention by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 4afa64e9079..1c7c5e4cf5a 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -57,7 +57,7 @@ struct Args { use_mlock: bool, /// Enable flash attention for faster inference. (EXPERIMENTAL) - #[clap(default_value = "false", long, env)] + #[clap(default_value = "true", long, env)] flash_attention: bool, /// TODO From e88a527fcffc2691afadda14b03d0b100dbb288f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 31 Jan 2025 16:23:22 +0000 Subject: [PATCH 14/63] Add --offload-kqv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 2 ++ backends/llamacpp/src/main.rs | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 63a937574d8..6f8cc59d90a 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -65,6 +65,7 @@ pub struct LlamacppConfig { pub defrag_threshold: f32, pub use_mmap: bool, pub use_mlock: bool, + pub offload_kqv: bool, pub flash_attention: bool, } @@ -177,6 +178,7 @@ impl Llamacpp { params.n_threads = conf.n_threads as _; params.n_threads_batch = conf.n_threads as _; // TODO ? 
params.defrag_thold = conf.defrag_threshold; + params.offload_kqv = conf.offload_kqv; params.flash_attn = conf.flash_attention; params.no_perf = true; bindings::llama_init_from_model(model, params) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 1c7c5e4cf5a..b5eec467761 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -56,6 +56,10 @@ struct Args { #[clap(default_value = "false", long, env)] use_mlock: bool, + /// Enable offloading of KQV operations to the GPU. + #[clap(default_value = "false", long, env)] + offload_kqv: bool, + /// Enable flash attention for faster inference. (EXPERIMENTAL) #[clap(default_value = "true", long, env)] flash_attention: bool, @@ -201,6 +205,7 @@ async fn main() -> Result<(), RouterError> { use_mmap: args.use_mmap, use_mlock: args.use_mlock, flash_attention: args.flash_attention, + offload_kqv: args.offload_kqv, max_batch_total_tokens: args.max_batch_total_tokens, max_batch_size: args.max_batch_size, batch_timeout: tokio::time::Duration::from_millis(5), From f38c34aeb794a4757875ae9caf081d1e0b36296f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 31 Jan 2025 18:20:45 +0000 Subject: [PATCH 15/63] Fix batch_pos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 6f8cc59d90a..2ad0e49175a 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -226,7 +226,7 @@ impl Llamacpp { pos: bindings::llama_pos, seq_id: bindings::llama_seq_id, logits: bool, - ) { + ) -> usize { let n = self.batch.n_tokens as usize; unsafe { *self.batch.token.add(n) = token; @@ -236,6 +236,7 @@ impl Llamacpp { *self.batch.logits.add(n) = logits as i8; } self.batch.n_tokens += 1; + n } // useless ? 
@@ -578,8 +579,7 @@ impl LlamacppBackend { for seq in seqs.iter_mut() { if seq.running { - llamacpp.batch_push(seq.token, seq.pos, seq.id as _, true); - seq.batch_pos = 0; + seq.batch_pos = llamacpp.batch_push(seq.token, seq.pos, seq.id as _, true); seq.pos += 1; } } From 960c12bd6e4e57132ab39e4d1dee0df8e54b7506 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 31 Jan 2025 22:13:59 +0100 Subject: [PATCH 16/63] backend(llama): add CUDA Dockerfile_llamacpp for now --- Dockerfile_llamacpp | 95 ++++++++++++++++-------------- backends/llamacpp/build.rs | 24 +++++++- backends/llamacpp/requirements.txt | 2 + 3 files changed, 76 insertions(+), 45 deletions(-) create mode 100644 backends/llamacpp/requirements.txt diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 006a320412f..2c5b70cbd08 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -1,7 +1,10 @@ -FROM ubuntu:24.04 AS base +ARG llama_version=b4599 +ARG llama_hardware_target=cpu + +FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS base ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y \ +RUN apt update && apt install -y \ python3-venv \ python3-pip @@ -10,68 +13,72 @@ ENV PATH="/venv/bin:$PATH" RUN pip3 install --no-cache-dir transformers FROM base AS deps -WORKDIR /deps +WORKDIR /opt/src -RUN apt-get install -y \ - clang cmake git +RUN apt install -y \ + clang \ + cmake \ + curl \ + git \ + libssl-dev \ + pkg-config \ + tar -# nvidia-cuda-toolkit -# -DGGML_CUDA=ON \ +FROM deps AS llamacpp-builder +ARG llama_version +ENV LLAMA_VERSION=${llama_version} -ENV LLAMA_VERSION=b4599 -RUN git clone --depth 1 -b ${LLAMA_VERSION} https://github.com/ggerganov/llama.cpp \ - && cd llama.cpp \ - && cmake -B build \ - -DCMAKE_INSTALL_PREFIX=/usr \ - -DCMAKE_INSTALL_LIBDIR=/usr/lib \ +ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_VERSION}.tar.gz /opt/src/ +RUN tar -xzf ${LLAMA_VERSION}.tar.gz && \ + cd llama.cpp-${LLAMA_VERSION} && \ + cmake \ + -B build \ + -DCMAKE_INSTALL_PREFIX=/usr/llama \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ + -DGGML_CUDA=1 \ -DLLAMA_BUILD_COMMON=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_SERVER=OFF \ - && cmake --build build --config Release -j \ + && cmake --build build --parallel --config Release -j \ && cmake --install build -# ENV MIMALLOC_VERSION=v3.0.1 -# RUN git clone --depth 1 -b ${MIMALLOC_VERSION} https://github.com/microsoft/mimalloc \ -# && cd mimalloc \ -# && cmake -B build \ -# -DCMAKE_INSTALL_PREFIX=/usr \ -# -DCMAKE_INSTALL_LIBDIR=/usr/lib \ -# -DCMAKE_C_COMPILER=clang \ -# -DCMAKE_CXX_COMPILER=clang++ \ -# && cmake --build build --config Release -j \ -# && cmake --install build - -RUN apt-get install -y \ - curl pkg-config libssl-dev - -WORKDIR /app +FROM deps AS rust-builder COPY rust-toolchain.toml rust-toolchain.toml RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none ENV PATH="/root/.cargo/bin:$PATH" -RUN cargo install cargo-chef --locked -FROM deps AS planner COPY . . 
-RUN cargo chef prepare --recipe-path recipe.json +COPY --from=llamacpp-builder /usr/llama/lib/ /usr/lib/ +COPY --from=llamacpp-builder /usr/llama/include/ /usr/include/ + -FROM deps AS builder -COPY --from=planner /app/recipe.json recipe.json -RUN cargo chef cook \ - --recipe-path recipe.json \ +ARG llama_hardware_target +ENV TGI_LLAMA_HARDWARE_TARGET=${llama_hardware_target} +RUN export TGI_LIB_SEARCH_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs && \ + ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \ + cargo build \ --profile release-opt \ --package text-generation-router-llamacpp -COPY . . -RUN cargo build \ - --profile release-opt \ - --package text-generation-router-llamacpp --frozen -FROM base AS runtime +FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 +WORKDIR /usr/bin + +ENV DEBIAN_FRONTEND=noninteractive +ENV PATH="/venv/bin:$PATH" + +RUN apt update && apt install -y \ + openssl \ + python3-venv \ + python3-pip + +RUN python3 -m venv /venv && \ + pip3 install --no-cache-dir -r backends/llamacpp/requirements.txt -COPY --from=deps /usr/lib/libllama.so /usr/lib/ -COPY --from=deps /usr/lib/libggml*.so /usr/lib/ -COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /bin/text-generation-launcher +COPY --from=llamacpp-builder /usr/llama/lib/ /usr/lib/ +COPY --from=llamacpp-builder /usr/llama/include/ /usr/include/ +COPY --from=llamacpp-builder /usr/llama/bin/ /usr/bin/ +COPY --from=rust-builder /opt/src/target/release-opt/text-generation-router-llamacpp /usr/bin/text-generation-launcher ENTRYPOINT ["text-generation-launcher"] diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 844da32109d..2603f4e7542 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -1,7 +1,29 @@ +use std::collections::HashMap; use std::env; use std::path::PathBuf; +fn inject_transient_dependencies(lib_search_path: Option<&str>, lib_target_hardware: &str) { + let hardware_targets = HashMap::from([("cpu", None), ("cuda", Some(vec!["cuda"]))]); + + if let Some(lib_search_path) = lib_search_path { + lib_search_path.split(":").for_each(|path| { + println!("cargo:rustc-link-search=dependency={path}"); + }); + } + + if let Some(hardware_transient_deps) = hardware_targets.get(lib_target_hardware) { + if let Some(additional_transient_deps) = hardware_transient_deps { + additional_transient_deps.iter().for_each(|dep| { + println!("cargo:rustc-link-lib={dep}"); + }); + } + } +} + fn main() { + let lib_search_path = option_env!("TGI_LLAMA_LD_LIBRARY_PATH"); + let lib_target_hardware = option_env!("TGI_LLAMA_HARDWARE_TARGET").unwrap_or("cpu"); + let bindings = bindgen::Builder::default() .header("src/wrapper.h") .prepend_enum_name(false) @@ -16,5 +38,5 @@ fn main() { pkg_config::Config::new().probe("llama").unwrap(); - println!("cargo::rerun-if-changed=build.rs"); + inject_transient_dependencies(lib_search_path, lib_target_hardware); } diff --git a/backends/llamacpp/requirements.txt b/backends/llamacpp/requirements.txt new file mode 100644 index 00000000000..d7cff7bd0c1 --- /dev/null +++ b/backends/llamacpp/requirements.txt @@ -0,0 +1,2 @@ +transformers==4.48.2 +huggingface-hub==0.28.1 \ No newline at end of file From 161280f3136e643481580536099d3b70752e19c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 1 Feb 2025 10:51:44 +0000 Subject: [PATCH 17/63] Only export the latest logits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 2ad0e49175a..f95157f5445 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -474,19 +474,21 @@ impl LlamacppBackend { continue; }, }; + let last_pos = request.input_ids.len() - 1; + for (pos, &token_id) in request.input_ids.iter().enumerate() { llamacpp.batch_push( token_id as bindings::llama_token, pos as bindings::llama_pos, seq_id as bindings::llama_seq_id, - true, // TODO + pos == last_pos, // check samplers ); } seqs.push(LlamacppSeq { id: seq_id, batch_pos: llamacpp.batch.n_tokens as usize - 1, token: -1, - pos: request.input_ids.len() as _, + pos: last_pos as bindings::llama_pos + 1, sampler: sampler, text: String::with_capacity(1024), n_new_tokens: 0, From 2a51e415ff4bba5f2d5252a502119eb123c51a14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 1 Feb 2025 11:37:14 +0000 Subject: [PATCH 18/63] Output real logprobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index f95157f5445..ebb403807ee 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -364,7 +364,7 @@ impl LlamacppSampler { bindings::llama_sampler_apply(self.chain, &mut view); let logprob = *view.data.offset(view.selected as _); bindings::llama_sampler_accept(self.chain, logprob.id); - (logprob.id, logprob.logit) // maybe p.ln() ? + (logprob.id, logprob.p.ln()) } } } From 96434a1e7e65ca011051ce661ae6eb9afea88399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 1 Feb 2025 16:09:51 +0000 Subject: [PATCH 19/63] Fix batching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index ebb403807ee..c07f0812f36 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -429,12 +429,15 @@ impl LlamacppBackend { requests = Vec::new(); continue; } - if n_tokens + request.input_ids.len() > conf.max_batch_total_tokens as usize { + let n_tokens_to_add = request.input_ids.len(); + + if n_tokens + n_tokens_to_add > conf.max_batch_total_tokens as usize { let _ = sync_tx.send(requests); - n_tokens = request.input_ids.len(); + n_tokens = n_tokens_to_add; requests = vec![request]; continue; } + n_tokens += n_tokens_to_add; requests.push(request); }, Err(_) => { @@ -487,7 +490,7 @@ impl LlamacppBackend { seqs.push(LlamacppSeq { id: seq_id, batch_pos: llamacpp.batch.n_tokens as usize - 1, - token: -1, + token: bindings::LLAMA_TOKEN_NULL, pos: last_pos as bindings::llama_pos + 1, sampler: sampler, text: String::with_capacity(1024), From 27534d8ee473f89c18b6d7fd0afab6261a6e5ddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 1 Feb 2025 17:55:00 +0000 Subject: [PATCH 20/63] Fix seq iterations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 12 +++++++----- 1 file 
changed, 7 insertions(+), 5 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index c07f0812f36..ca41f302efc 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -470,6 +470,7 @@ impl LlamacppBackend { for (seq_id, request) in requests.iter().enumerate() { debug!("Request: {:?}", request); + // TODO remove this let sampler = match LlamacppSampler::new(&request) { Some(sampler) => sampler, _ => { @@ -506,11 +507,9 @@ impl LlamacppBackend { bindings::llama_decode(llamacpp.ctx, llamacpp.batch) }; if decode != 0 { - error!("Failed to decode batch: {decode}"); - if decode == 1 { unsafe { - bindings::llama_kv_cache_clear(llamacpp.ctx); // TODO + bindings::llama_kv_cache_clear(llamacpp.ctx); // TODO: remove this ? } } for seq in seqs.iter_mut() { @@ -523,6 +522,9 @@ impl LlamacppBackend { bindings::llama_get_kv_cache_used_cells(llamacpp.ctx) }; for seq in seqs.iter_mut() { + if !seq.running { + continue; + } let (next, logprob) = seq.sampler.sample(&mut llamacpp, seq.batch_pos); seq.n_new_tokens += 1; seq.token = next; @@ -533,7 +535,7 @@ impl LlamacppBackend { error!("Failed to decode token: {e}"); let _ = requests[seq.id].tx.send(Err(InferError::IncompleteGeneration)); seq.running = false; - break; + continue; }, }; let special = vocab.is_special_token(&piece); @@ -572,7 +574,7 @@ impl LlamacppBackend { queued: requests[seq.id].time, })); seq.running = false; - break; + continue; } let _ = requests[seq.id].tx.send(Ok(InferStreamResponse::Intermediate { token: token, From c8505fb300735da0f09eff06f4dcd9ecb62ad78c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 1 Feb 2025 18:33:26 +0000 Subject: [PATCH 21/63] Auto-detect n_threads when not provided MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Cargo.lock | 1 + backends/llamacpp/Cargo.toml | 1 + backends/llamacpp/src/main.rs | 10 +++++++--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 73ed43c6479..902fe7e36e9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4643,6 +4643,7 @@ dependencies = [ "async-trait", "bindgen 0.71.1", "clap 4.5.27", + "num_cpus", "pkg-config", "text-generation-router", "thiserror 2.0.11", diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index b1ff3c3fc7f..18c2ed0a80b 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -12,6 +12,7 @@ pkg-config = "0.3.31" [dependencies] async-trait = "0.1.85" clap = "4.5.27" +num_cpus = "1.16.0" text-generation-router = { path = "../../router" } thiserror = "2.0.11" tokenizers.workspace = true diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index b5eec467761..f3e8178281f 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -29,8 +29,8 @@ struct Args { n_ctx: usize, /// Number of threads to use for inference. - #[clap(default_value = "1", long, env)] - n_threads: usize, + #[clap(long, env)] + n_threads: Option, /// Number of layers to store in VRAM. 
#[clap(default_value = "0", long, env)] @@ -155,6 +155,10 @@ async fn main() -> Result<(), RouterError> { args.json_output ); + let n_threads = match args.n_threads { + Some(0) | None => num_cpus::get(), + Some(threads) => threads, + }; if args.max_input_tokens >= args.max_total_tokens { return Err(RouterError::ArgumentValidation( "`max_input_tokens` must be < `max_total_tokens`".to_string(), @@ -197,7 +201,7 @@ async fn main() -> Result<(), RouterError> { LlamacppConfig { model_gguf: args.model_gguf, n_ctx: args.n_ctx, - n_threads: args.n_threads, + n_threads: n_threads, n_gpu_layers: args.n_gpu_layers, split_mode: args.split_mode, defrag_threshold: args.defrag_threshold, From 8ed362d03a58811e03064508f5268fc7b0f0498c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 1 Feb 2025 20:20:43 +0000 Subject: [PATCH 22/63] Clear request cache after completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index ca41f302efc..e2fe84e8b6f 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -507,10 +507,10 @@ impl LlamacppBackend { bindings::llama_decode(llamacpp.ctx, llamacpp.batch) }; if decode != 0 { - if decode == 1 { - unsafe { - bindings::llama_kv_cache_clear(llamacpp.ctx); // TODO: remove this ? - } + warn!("llama_decode failed: kv cache clear + sync"); + unsafe { + bindings::llama_kv_cache_clear(llamacpp.ctx); + bindings::llama_synchronize(llamacpp.ctx); } for seq in seqs.iter_mut() { let _ = requests[seq.id].tx.send(Err(InferError::IncompleteGeneration)); @@ -588,6 +588,10 @@ impl LlamacppBackend { if seq.running { seq.batch_pos = llamacpp.batch_push(seq.token, seq.pos, seq.id as _, true); seq.pos += 1; + } else { + unsafe { + bindings::llama_kv_cache_seq_rm(llamacpp.ctx, seq.id as _, -1, -1); + } } } } From 104a968d018e6f9845f142a1165b98ac6f71601c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 1 Feb 2025 20:27:31 +0000 Subject: [PATCH 23/63] Remove warmup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 34 -------------------------------- 1 file changed, 34 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index e2fe84e8b6f..5b44c4e61ed 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -238,38 +238,6 @@ impl Llamacpp { self.batch.n_tokens += 1; n } - - // useless ? 
- fn warmup(&self) { - let mut buf: Vec = Vec::new(); - - let bos = unsafe { - bindings::llama_vocab_bos(self.vocab) - }; - if bos != bindings::LLAMA_TOKEN_NULL { - buf.push(bos); - } - let eos = unsafe { - bindings::llama_vocab_eos(self.vocab) - }; - if eos != bindings::LLAMA_TOKEN_NULL { - buf.push(eos); - } - if buf.is_empty() { - warn!("Warmup failed: no bos/eos..."); - return; - } - let batch = unsafe { - bindings::llama_batch_get_one(buf.as_ptr() as _, buf.len() as _) - }; - if unsafe { bindings::llama_decode(self.ctx, batch) } != 0 { - error!("Warmup failed: llama_decode() returned an error"); - } - unsafe { - bindings::llama_kv_cache_clear(self.ctx); - bindings::llama_synchronize(self.ctx); - } - } } impl Drop for Llamacpp { @@ -456,8 +424,6 @@ impl LlamacppBackend { Ok(v) => { let _ = ok_tx.send(Ok(())); v }, Err(e) => { let _ = ok_tx.send(Err(e)); return; }, }; - llamacpp.warmup(); - let vocab = tokenizer.get_added_vocabulary(); // health() returns true From ea28332bb387bf6f0437f44f463e231809189409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Sat, 1 Feb 2025 20:40:59 +0000 Subject: [PATCH 24/63] Cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 5b44c4e61ed..c76f030808d 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -212,11 +212,9 @@ impl Llamacpp { Ok(Llamacpp{model, ctx, vocab, logprobs, n_ctx, batch}) } - fn _batch_clear_logits(&mut self) { - for n in 0..self.batch.n_tokens as usize{ - unsafe { - *self.batch.logits.add(n) = 0 as i8; - } + fn clear_kv_cache(&mut self, seq_id: bindings::llama_seq_id) { + unsafe { + bindings::llama_kv_cache_seq_rm(self.ctx, seq_id, -1, -1); } } @@ -473,11 +471,8 @@ impl LlamacppBackend { bindings::llama_decode(llamacpp.ctx, llamacpp.batch) }; if decode != 0 { - warn!("llama_decode failed: kv cache clear + sync"); - unsafe { - bindings::llama_kv_cache_clear(llamacpp.ctx); - bindings::llama_synchronize(llamacpp.ctx); - } + warn!("llama_decode failed, clearing kv cache"); + llamacpp.clear_kv_cache(-1); for seq in seqs.iter_mut() { let _ = requests[seq.id].tx.send(Err(InferError::IncompleteGeneration)); seq.running = false; @@ -555,9 +550,7 @@ impl LlamacppBackend { seq.batch_pos = llamacpp.batch_push(seq.token, seq.pos, seq.id as _, true); seq.pos += 1; } else { - unsafe { - bindings::llama_kv_cache_seq_rm(llamacpp.ctx, seq.id as _, -1, -1); - } + llamacpp.clear_kv_cache(seq.id as _); } } } From e6a8d339026f6293dc16ccc9bc030ce5549f8468 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 3 Feb 2025 11:36:44 +0100 Subject: [PATCH 25/63] backend(llama): add CUDA architectures build argument for Dockerfile --- Dockerfile_llamacpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 2c5b70cbd08..48d83594cc0 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -1,5 +1,6 @@ ARG llama_version=b4599 ARG llama_hardware_target=cpu +ARG llama_cuda_arch=75-real;80-real;86-real;89-real;90-real FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS base @@ -26,6 +27,7 @@ RUN apt install -y \ FROM deps AS llamacpp-builder ARG llama_version +ARG llama_cuda_arch ENV LLAMA_VERSION=${llama_version} ADD 
https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_VERSION}.tar.gz /opt/src/ @@ -36,6 +38,7 @@ RUN tar -xzf ${LLAMA_VERSION}.tar.gz && \ -DCMAKE_INSTALL_PREFIX=/usr/llama \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CUDA_ARCHITECTURES=${llama_cuda_arch} \ -DGGML_CUDA=1 \ -DLLAMA_BUILD_COMMON=OFF \ -DLLAMA_BUILD_TESTS=OFF \ @@ -74,7 +77,7 @@ RUN apt update && apt install -y \ python3-pip RUN python3 -m venv /venv && \ - pip3 install --no-cache-dir -r backends/llamacpp/requirements.txt + pip3 install --no-cache-dir -r transformers COPY --from=llamacpp-builder /usr/llama/lib/ /usr/lib/ COPY --from=llamacpp-builder /usr/llama/include/ /usr/include/ From bfb8e03e9f3f5799c8d081a7490c78f36f976d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Mon, 3 Feb 2025 11:03:47 +0000 Subject: [PATCH 26/63] Add specific args for batch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 6 ++-- backends/llamacpp/src/main.rs | 50 ++++++++++++++++++++++---------- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index c76f030808d..bf4b19e3cd8 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -56,9 +56,11 @@ pub struct LlamacppConfig { pub model_gguf: String, pub n_ctx: usize, pub max_batch_total_tokens: usize, + pub max_physical_batch_total_tokens: usize, pub max_batch_size: usize, pub batch_timeout: Duration, pub n_threads: usize, + pub n_threads_batch: usize, pub n_gpu_layers: usize, pub split_mode: LlamacppSplitMode, pub numa: LlamacppNuma, @@ -173,10 +175,10 @@ impl Llamacpp { let mut params = bindings::llama_context_default_params(); params.n_ctx = conf.n_ctx as _; params.n_batch = conf.max_batch_total_tokens as _; - params.n_ubatch = conf.max_batch_total_tokens as _; // TODO ? + params.n_ubatch = conf.max_physical_batch_total_tokens as _; params.n_seq_max = conf.max_batch_size as _; params.n_threads = conf.n_threads as _; - params.n_threads_batch = conf.n_threads as _; // TODO ? + params.n_threads_batch = conf.n_threads_batch as _; params.defrag_thold = conf.defrag_threshold; params.offload_kqv = conf.offload_kqv; params.flash_attn = conf.flash_attention; diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index f3e8178281f..55881b1340b 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -28,10 +28,14 @@ struct Args { #[clap(default_value = "4096", long, env)] n_ctx: usize, - /// Number of threads to use for inference. + /// Number of threads to use for generation. #[clap(long, env)] n_threads: Option, + /// Number of threads to use for batch processing. + #[clap(long, env)] + n_threads_batch: Option, + /// Number of layers to store in VRAM. 
#[clap(default_value = "0", long, env)] n_gpu_layers: usize, @@ -89,10 +93,14 @@ struct Args { // #[clap(default_value = "4096", long, env)] // max_batch_prefill_tokens: u32, - /// Maximum tokens within a batch + /// Maximum number of tokens that can be submitted within a batch #[clap(default_value = "4096", long, env)] max_batch_total_tokens: usize, + /// Maximum number of tokens within a batch + #[clap(long, env)] + max_physical_batch_total_tokens: Option, + // #[clap(default_value = "20", long, env)] // max_waiting_tokens: usize, @@ -159,6 +167,14 @@ async fn main() -> Result<(), RouterError> { Some(0) | None => num_cpus::get(), Some(threads) => threads, }; + let n_threads_batch = match args.n_threads_batch { + Some(0) | None => n_threads, + Some(threads) => threads, + }; + let max_physical_batch_total_tokens = match args.max_physical_batch_total_tokens { + None => args.max_batch_total_tokens, + Some(size) => size, + }; if args.max_input_tokens >= args.max_total_tokens { return Err(RouterError::ArgumentValidation( "`max_input_tokens` must be < `max_total_tokens`".to_string(), @@ -199,20 +215,22 @@ async fn main() -> Result<(), RouterError> { let (backend, ok) = LlamacppBackend::new( LlamacppConfig { - model_gguf: args.model_gguf, - n_ctx: args.n_ctx, - n_threads: n_threads, - n_gpu_layers: args.n_gpu_layers, - split_mode: args.split_mode, - defrag_threshold: args.defrag_threshold, - numa: args.numa, - use_mmap: args.use_mmap, - use_mlock: args.use_mlock, - flash_attention: args.flash_attention, - offload_kqv: args.offload_kqv, - max_batch_total_tokens: args.max_batch_total_tokens, - max_batch_size: args.max_batch_size, - batch_timeout: tokio::time::Duration::from_millis(5), + model_gguf: args.model_gguf, + n_ctx: args.n_ctx, + n_threads: n_threads, + n_threads_batch: n_threads_batch, + n_gpu_layers: args.n_gpu_layers, + split_mode: args.split_mode, + defrag_threshold: args.defrag_threshold, + numa: args.numa, + use_mmap: args.use_mmap, + use_mlock: args.use_mlock, + flash_attention: args.flash_attention, + offload_kqv: args.offload_kqv, + max_batch_total_tokens: args.max_batch_total_tokens, + max_physical_batch_total_tokens: max_physical_batch_total_tokens, + max_batch_size: args.max_batch_size, + batch_timeout: tokio::time::Duration::from_millis(5), }, tokenizer, ); From 38b33e9698cf672e36ea86306549395500d924a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Mon, 3 Feb 2025 12:39:28 +0000 Subject: [PATCH 27/63] Add --type-v & --type-k MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 79 ++++++++++++++++++++++++++++++++ backends/llamacpp/src/main.rs | 12 ++++- 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index bf4b19e3cd8..04160cc4d22 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -52,6 +52,81 @@ pub enum LlamacppNuma { Mirror, } +#[allow(non_camel_case_types)] +#[derive(Debug, Clone, Copy, clap::ValueEnum)] +pub enum LlamacppGGMLType { + F32, + F16, + Q4_0, + Q4_1, + Q5_0, + Q5_1, + Q8_0, + Q8_1, + Q2_K, + Q3_K, + Q4_K, + Q5_K, + Q6_K, + Q8_K, + IQ2_XXS, + IQ2_XS, + IQ3_XXS, + IQ1_S, + IQ4_NL, + IQ3_S, + IQ2_S, + IQ4_XS, + I8, + I16, + I32, + I64, + F64, + IQ1_M, + BF16, + TQ1_0, + TQ2_0, +} + +// TODO: macro +impl LlamacppGGMLType { + fn to_ggml_type(&self) -> bindings::ggml_type { + match self { + LlamacppGGMLType::F32 => 
bindings::GGML_TYPE_F32, + LlamacppGGMLType::F16 => bindings::GGML_TYPE_F16, + LlamacppGGMLType::Q4_0 => bindings::GGML_TYPE_Q4_0, + LlamacppGGMLType::Q4_1 => bindings::GGML_TYPE_Q4_1, + LlamacppGGMLType::Q5_0 => bindings::GGML_TYPE_Q5_0, + LlamacppGGMLType::Q5_1 => bindings::GGML_TYPE_Q5_1, + LlamacppGGMLType::Q8_0 => bindings::GGML_TYPE_Q8_0, + LlamacppGGMLType::Q8_1 => bindings::GGML_TYPE_Q8_1, + LlamacppGGMLType::Q2_K => bindings::GGML_TYPE_Q2_K, + LlamacppGGMLType::Q3_K => bindings::GGML_TYPE_Q3_K, + LlamacppGGMLType::Q4_K => bindings::GGML_TYPE_Q4_K, + LlamacppGGMLType::Q5_K => bindings::GGML_TYPE_Q5_K, + LlamacppGGMLType::Q6_K => bindings::GGML_TYPE_Q6_K, + LlamacppGGMLType::Q8_K => bindings::GGML_TYPE_Q8_K, + LlamacppGGMLType::IQ2_XXS => bindings::GGML_TYPE_IQ2_XXS, + LlamacppGGMLType::IQ2_XS => bindings::GGML_TYPE_IQ2_XS, + LlamacppGGMLType::IQ3_XXS => bindings::GGML_TYPE_IQ3_XXS, + LlamacppGGMLType::IQ1_S => bindings::GGML_TYPE_IQ1_S, + LlamacppGGMLType::IQ4_NL => bindings::GGML_TYPE_IQ4_NL, + LlamacppGGMLType::IQ3_S => bindings::GGML_TYPE_IQ3_S, + LlamacppGGMLType::IQ2_S => bindings::GGML_TYPE_IQ2_S, + LlamacppGGMLType::IQ4_XS => bindings::GGML_TYPE_IQ4_XS, + LlamacppGGMLType::I8 => bindings::GGML_TYPE_I8, + LlamacppGGMLType::I16 => bindings::GGML_TYPE_I16, + LlamacppGGMLType::I32 => bindings::GGML_TYPE_I32, + LlamacppGGMLType::I64 => bindings::GGML_TYPE_I64, + LlamacppGGMLType::F64 => bindings::GGML_TYPE_F64, + LlamacppGGMLType::IQ1_M => bindings::GGML_TYPE_IQ1_M, + LlamacppGGMLType::BF16 => bindings::GGML_TYPE_BF16, + LlamacppGGMLType::TQ1_0 => bindings::GGML_TYPE_TQ1_0, + LlamacppGGMLType::TQ2_0 => bindings::GGML_TYPE_TQ2_0, + } + } +} + pub struct LlamacppConfig { pub model_gguf: String, pub n_ctx: usize, @@ -69,6 +144,8 @@ pub struct LlamacppConfig { pub use_mlock: bool, pub offload_kqv: bool, pub flash_attention: bool, + pub type_k: LlamacppGGMLType, + pub type_v: LlamacppGGMLType, } #[derive(Debug)] @@ -182,6 +259,8 @@ impl Llamacpp { params.defrag_thold = conf.defrag_threshold; params.offload_kqv = conf.offload_kqv; params.flash_attn = conf.flash_attention; + params.type_k = conf.type_k.to_ggml_type(); + params.type_v = conf.type_v.to_ggml_type(); params.no_perf = true; bindings::llama_init_from_model(model, params) }; diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 55881b1340b..dba391c0891 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,6 +1,6 @@ mod backend; -use backend::{LlamacppNuma, LlamacppSplitMode, LlamacppConfig, LlamacppBackend, BackendError}; +use backend::{LlamacppNuma, LlamacppGGMLType, LlamacppSplitMode, LlamacppConfig, LlamacppBackend, BackendError}; use clap::{Parser}; use text_generation_router::{logging, server, usage_stats}; use thiserror::Error; @@ -68,6 +68,14 @@ struct Args { #[clap(default_value = "true", long, env)] flash_attention: bool, + /// Use data type for K cache. + #[clap(default_value = "f16", value_enum, long, env)] + type_k: LlamacppGGMLType, + + /// Use data type for V cache. 
+ #[clap(default_value = "f16", value_enum, long, env)] + type_v: LlamacppGGMLType, + /// TODO #[clap(default_value = "2", long, env)] validation_workers: usize, @@ -226,6 +234,8 @@ async fn main() -> Result<(), RouterError> { use_mmap: args.use_mmap, use_mlock: args.use_mlock, flash_attention: args.flash_attention, + type_k: args.type_k, + type_v: args.type_v, offload_kqv: args.offload_kqv, max_batch_total_tokens: args.max_batch_total_tokens, max_physical_batch_total_tokens: max_physical_batch_total_tokens, From 207041a97776e081254035ee0c1b2a3eb6df133f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Mon, 3 Feb 2025 13:38:42 +0000 Subject: [PATCH 28/63] Bump llamacpp to b4623 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Dockerfile_llamacpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 48d83594cc0..139e80cedba 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -1,4 +1,4 @@ -ARG llama_version=b4599 +ARG llama_version=b4623 ARG llama_hardware_target=cpu ARG llama_cuda_arch=75-real;80-real;86-real;89-real;90-real From d883109df68529d971d54aa648df0c3d3fca6e4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Mon, 3 Feb 2025 20:58:33 +0000 Subject: [PATCH 29/63] Disable graceful shutdown in debug mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 8 ++++++-- backends/llamacpp/src/main.rs | 12 ++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 04160cc4d22..870798e7d78 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -441,7 +441,7 @@ impl LlamacppBackend { pub fn new( conf: LlamacppConfig, tokenizer: Tokenizer, - ) -> (Self, oneshot::Receiver>) { + ) -> (Self, oneshot::Receiver>, watch::Sender) { // Setup llama & export logs, once and for all INIT.call_once(|| unsafe { @@ -457,6 +457,7 @@ impl LlamacppBackend { }); let (status_tx, status_rx) = watch::channel(false); + let (shutdown_tx, shutdown_rx) = watch::channel(false); let (ok_tx, ok_rx) = oneshot::channel(); let (tx, mut rx) = unbounded_channel::(); let (sync_tx, sync_rx) = mpsc::channel(); @@ -509,6 +510,9 @@ impl LlamacppBackend { let _ = status_tx.send(true); while let Ok(requests) = sync_rx.recv() { + if shutdown_rx.borrow().clone() { + break; + } let start_time = Instant::now(); let mut seqs: Vec = Vec::with_capacity(requests.len()); llamacpp.batch.n_tokens = 0; @@ -637,7 +641,7 @@ impl LlamacppBackend { } } }); - (Self{tx, status: status_rx}, ok_rx) + (Self{tx, status: status_rx}, ok_rx, shutdown_tx) } } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index dba391c0891..1b8c4c5db13 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -6,7 +6,7 @@ use text_generation_router::{logging, server, usage_stats}; use thiserror::Error; use tokenizers::{Tokenizer, FromPretrainedParameters}; use tokio::sync::oneshot::error::RecvError; -use tracing::error; +use tracing::{warn, error}; /// Backend Configuration #[derive(Parser, Debug)] @@ -221,7 +221,7 @@ async fn main() -> Result<(), RouterError> { )? 
}; - let (backend, ok) = LlamacppBackend::new( + let (backend, ok, shutdown) = LlamacppBackend::new( LlamacppConfig { model_gguf: args.model_gguf, n_ctx: args.n_ctx, @@ -246,6 +246,14 @@ async fn main() -> Result<(), RouterError> { ); ok.await??; + if cfg!(debug_assertions) { + warn!("Graceful shutdown disabled!"); + let _ = tokio::task::spawn(async move { + let _ = tokio::signal::ctrl_c().await; + let _ = shutdown.send(true); + }); + } + server::run( backend, args.max_concurrent_requests, From df2a4fbb8aa9d2ab894356b9e9d56b84b2e17595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Tue, 4 Feb 2025 12:34:02 +0000 Subject: [PATCH 30/63] Update Dockerfile_llamacpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Dockerfile_llamacpp | 85 +++++++++++++++++--------------------- backends/llamacpp/build.rs | 4 ++ 2 files changed, 42 insertions(+), 47 deletions(-) diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 139e80cedba..5b21124cce5 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -1,41 +1,27 @@ -ARG llama_version=b4623 -ARG llama_hardware_target=cpu +FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS deps + +ARG llama_version=b4628 ARG llama_cuda_arch=75-real;80-real;86-real;89-real;90-real -FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS base +WORKDIR /opt/src ENV DEBIAN_FRONTEND=noninteractive RUN apt update && apt install -y \ - python3-venv \ - python3-pip - -RUN python3 -m venv /venv -ENV PATH="/venv/bin:$PATH" -RUN pip3 install --no-cache-dir transformers - -FROM base AS deps -WORKDIR /opt/src - -RUN apt install -y \ clang \ cmake \ curl \ git \ + python3-dev \ libssl-dev \ pkg-config \ tar -FROM deps AS llamacpp-builder -ARG llama_version -ARG llama_cuda_arch -ENV LLAMA_VERSION=${llama_version} - -ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_VERSION}.tar.gz /opt/src/ -RUN tar -xzf ${LLAMA_VERSION}.tar.gz && \ - cd llama.cpp-${LLAMA_VERSION} && \ - cmake \ - -B build \ - -DCMAKE_INSTALL_PREFIX=/usr/llama \ +ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llama_version}.tar.gz /opt/src/ +RUN tar -xzf ${llama_version}.tar.gz \ + && cd llama.cpp-${llama_version} \ + && cmake -B build \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DCMAKE_INSTALL_LIBDIR=/usr/lib \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CUDA_ARCHITECTURES=${llama_cuda_arch} \ @@ -44,44 +30,49 @@ RUN tar -xzf ${LLAMA_VERSION}.tar.gz && \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ -DLLAMA_BUILD_SERVER=OFF \ - && cmake --build build --parallel --config Release -j \ + && cmake --build build --parallel --config Release \ && cmake --install build -FROM deps AS rust-builder +WORKDIR /app COPY rust-toolchain.toml rust-toolchain.toml RUN curl -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain none ENV PATH="/root/.cargo/bin:$PATH" +RUN cargo install cargo-chef --locked +FROM deps AS planner COPY . . 
-COPY --from=llamacpp-builder /usr/llama/lib/ /usr/lib/ -COPY --from=llamacpp-builder /usr/llama/include/ /usr/include/ - +RUN cargo chef prepare --recipe-path recipe.json -ARG llama_hardware_target -ENV TGI_LLAMA_HARDWARE_TARGET=${llama_hardware_target} -RUN export TGI_LIB_SEARCH_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs && \ - ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \ - cargo build \ +FROM deps AS builder +COPY --from=planner /app/recipe.json recipe.json +RUN cargo chef cook \ + --recipe-path recipe.json \ --profile release-opt \ --package text-generation-router-llamacpp +COPY . . +ENV TGI_LLAMA_PKG_CUDA=cuda-12.6 +RUN cargo build \ + --profile release-opt \ + --package text-generation-router-llamacpp --frozen -FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 -WORKDIR /usr/bin +# fix libcuda.so.1 ? +RUN cp "$(pkg-config --variable=libdir cuda-12.6)"/stubs/libcuda.so /usr/lib/libcuda.so.1 -ENV DEBIAN_FRONTEND=noninteractive -ENV PATH="/venv/bin:$PATH" +FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 RUN apt update && apt install -y \ - openssl \ python3-venv \ python3-pip -RUN python3 -m venv /venv && \ - pip3 install --no-cache-dir -r transformers +RUN python3 -m venv /venv +ENV PATH="/venv/bin:$PATH" + +COPY backends/llamacpp/requirements.txt requirements.txt +RUN pip3 install --no-cache-dir -r requirements.txt -COPY --from=llamacpp-builder /usr/llama/lib/ /usr/lib/ -COPY --from=llamacpp-builder /usr/llama/include/ /usr/include/ -COPY --from=llamacpp-builder /usr/llama/bin/ /usr/bin/ -COPY --from=rust-builder /opt/src/target/release-opt/text-generation-router-llamacpp /usr/bin/text-generation-launcher +COPY --from=builder /usr/lib/libllama.so /usr/lib/ +COPY --from=builder /usr/lib/libggml*.so /usr/lib/ +COPY --from=builder /usr/lib/libcuda.so.1 /usr/lib/ +COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/ -ENTRYPOINT ["text-generation-launcher"] +ENTRYPOINT ["text-generation-router-llamacpp"] diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 2603f4e7542..e56272eeb5d 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -21,6 +21,7 @@ fn inject_transient_dependencies(lib_search_path: Option<&str>, lib_target_hardw } fn main() { + let pkg_cuda = option_env!("TGI_LLAMA_PKG_CUDA"); let lib_search_path = option_env!("TGI_LLAMA_LD_LIBRARY_PATH"); let lib_target_hardware = option_env!("TGI_LLAMA_HARDWARE_TARGET").unwrap_or("cpu"); @@ -36,6 +37,9 @@ fn main() { .write_to_file(out_path.join("bindings.rs")) .expect("Couldn't write bindings!"); + if let Some(pkg_cuda) = pkg_cuda { + pkg_config::Config::new().probe(pkg_cuda).unwrap(); + } pkg_config::Config::new().probe("llama").unwrap(); inject_transient_dependencies(lib_search_path, lib_target_hardware); From 906c265aef79b06f057ed0b1bc95eb8c7dc0dcd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Tue, 4 Feb 2025 17:53:47 +0000 Subject: [PATCH 31/63] Cleanup Dockerfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Dockerfile_llamacpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 5b21124cce5..ed8783d66f0 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -55,9 +55,6 @@ RUN cargo build \ --profile release-opt \ --package text-generation-router-llamacpp --frozen -# fix libcuda.so.1 ? 
-RUN cp "$(pkg-config --variable=libdir cuda-12.6)"/stubs/libcuda.so /usr/lib/libcuda.so.1 - FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 RUN apt update && apt install -y \ @@ -72,7 +69,6 @@ RUN pip3 install --no-cache-dir -r requirements.txt COPY --from=builder /usr/lib/libllama.so /usr/lib/ COPY --from=builder /usr/lib/libggml*.so /usr/lib/ -COPY --from=builder /usr/lib/libcuda.so.1 /usr/lib/ COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/ ENTRYPOINT ["text-generation-router-llamacpp"] From e007529590381e98314e2d89129f9816cc462f3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Tue, 4 Feb 2025 17:54:53 +0000 Subject: [PATCH 32/63] Update Cargo.lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 902fe7e36e9..547cff9b78f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4638,7 +4638,7 @@ dependencies = [ [[package]] name = "text-generation-router-llamacpp" -version = "3.0.2-dev0" +version = "3.1.1-dev0" dependencies = [ "async-trait", "bindgen 0.71.1", From d3a772a8dd063d3a283a27b5e7ee069a5dfedde0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 10:10:38 +0000 Subject: [PATCH 33/63] Update args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 84 +++++++++++++++++------------------ 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 1b8c4c5db13..5548773bf6a 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -20,7 +20,7 @@ struct Args { #[clap(default_value = "main", long, env)] revision: String, - /// Path to the GGUF model file to be used for inference. + /// Path to the GGUF model file for inference. #[clap(long, env)] model_gguf: String, // TODO Option() with hf->gguf & quantize @@ -48,15 +48,15 @@ struct Args { #[clap(default_value = "-1.0", long, env)] defrag_threshold: f32, - /// Setup NUMA optimizations. + /// Enable NUMA optimizations. #[clap(default_value = "disabled", value_enum, long, env)] numa: LlamacppNuma, - /// Whether to use memory mapping. + /// Use memory mapping for the model. #[clap(default_value = "true", long, env)] use_mmap: bool, - /// Whether to use memory locking. + /// Use memory locking to prevent swapping. #[clap(default_value = "false", long, env)] use_mlock: bool, @@ -68,95 +68,95 @@ struct Args { #[clap(default_value = "true", long, env)] flash_attention: bool, - /// Use data type for K cache. + /// Data type used for K cache. #[clap(default_value = "f16", value_enum, long, env)] type_k: LlamacppGGMLType, - /// Use data type for V cache. + /// Data type used for V cache. #[clap(default_value = "f16", value_enum, long, env)] type_v: LlamacppGGMLType, - /// TODO + /// Number of tokenizer workers used for payload validation and truncation. #[clap(default_value = "2", long, env)] validation_workers: usize, + + /// Maximum amount of concurrent requests. 
#[clap(default_value = "128", long, env)] max_concurrent_requests: usize, - #[clap(default_value = "2", long, env)] - max_best_of: usize, - #[clap(default_value = "4", long, env)] - max_stop_sequences: usize, - #[clap(default_value = "5", long, env)] - max_top_n_tokens: u32, - /// Maximum number of input tokens allowed per request. + /// Maximum number of input tokens per request. #[clap(default_value = "1024", long, env)] max_input_tokens: usize, - /// Maximum total tokens (input + output) allowed per request. + /// Maximum total tokens (input + output) per request. #[clap(default_value = "2048", long, env)] max_total_tokens: usize, -// #[clap(default_value = "1.2", long, env)] -// waiting_served_ratio: f32, -// #[clap(default_value = "4096", long, env)] -// max_batch_prefill_tokens: u32, - - /// Maximum number of tokens that can be submitted within a batch + /// Maximum number of tokens in a batch. #[clap(default_value = "4096", long, env)] max_batch_total_tokens: usize, - /// Maximum number of tokens within a batch + /// Maximum number of tokens in a physical batch. #[clap(long, env)] max_physical_batch_total_tokens: Option, -// #[clap(default_value = "20", long, env)] -// max_waiting_tokens: usize, - - /// Maximum number of requests per batch + /// Maximum number of requests per batch. #[clap(default_value = "1", long, env)] max_batch_size: usize, - /// The IP address to listen on + /// IP address to listen on. #[clap(default_value = "0.0.0.0", long, env)] hostname: String, - /// The port to listen on. + /// Port to listen on. #[clap(default_value = "3001", long, short, env)] port: u16, -// #[clap(default_value = "/tmp/text-generation-server-0", long, env)] -// master_shard_uds_path: String, -// #[clap(long, env)] -// tokenizer_name: String, -// #[clap(long, env)] -// tokenizer_config_path: Option, -// #[clap(long, env, value_enum)] -// trust_remote_code: bool, -// #[clap(long, env)] -// api_key: Option, - + /// Enable JSON output format. #[clap(long, env)] json_output: bool, + + /// OTLP endpoint for telemetry data. #[clap(long, env)] otlp_endpoint: Option, + + /// Service name for OTLP telemetry. #[clap(default_value = "text-generation-inference.router", long, env)] otlp_service_name: String, + + /// Allowed origins for CORS. #[clap(long, env)] cors_allow_origin: Option>, + + /// Enable Ngrok tunneling. #[clap(long, env)] ngrok: bool, + + /// Ngrok authentication token. #[clap(long, env)] ngrok_authtoken: Option, + + /// Ngrok edge to use for tunneling. #[clap(long, env)] ngrok_edge: Option, + + /// Path to the tokenizer configuration file. #[clap(long, env)] tokenizer_config_path: Option, + + /// Disable grammar support. #[clap(long, env, default_value_t = false)] disable_grammar_support: bool, + + /// Maximum number of inputs per request. #[clap(default_value = "4", long, env)] max_client_batch_size: usize, + + /// Level of usage statistics collection. #[clap(default_value = "on", long, env)] usage_stats: usage_stats::UsageStatsLevel, + + /// Maximum payload size limit in bytes. 
#[clap(default_value = "2000000", long, env)] payload_limit: usize, } @@ -257,9 +257,9 @@ async fn main() -> Result<(), RouterError> { server::run( backend, args.max_concurrent_requests, - args.max_best_of, - args.max_stop_sequences, - args.max_top_n_tokens, + 0, // max_best_of + 0, // max_stop_sequences + 0, // max_top_n_tokens args.max_input_tokens, args.max_total_tokens, args.validation_workers, From dbee80412967b6f52da138acf8b39efb81740234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 10:12:39 +0000 Subject: [PATCH 34/63] Simplify batching logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 33 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 870798e7d78..c6f4e9252c0 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -21,6 +21,7 @@ use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, info, warn, error, trace}; use tracing::{instrument}; use std::str::FromStr; +use std::mem::replace; #[derive(Debug, Clone, Copy)] pub enum LlamacppSplitMode { @@ -466,35 +467,29 @@ impl LlamacppBackend { let mut n_tokens = 0; let mut requests = Vec::with_capacity(conf.max_batch_size); + let flush = |requests: &mut Vec<_>, n_tokens: &mut usize| { + if !requests.is_empty() { + let _ = sync_tx.send(replace(requests, Vec::with_capacity(conf.max_batch_size))); + *n_tokens = 0; + } + }; loop { match timeout(conf.batch_timeout, rx.recv()).await { - Ok(None) => break, // closed Ok(Some(request)) => { - if requests.len() + 1 == conf.max_batch_size { - requests.push(request); - let _ = sync_tx.send(requests); - n_tokens = 0; - requests = Vec::new(); - continue; - } let n_tokens_to_add = request.input_ids.len(); if n_tokens + n_tokens_to_add > conf.max_batch_total_tokens as usize { - let _ = sync_tx.send(requests); - n_tokens = n_tokens_to_add; - requests = vec![request]; - continue; + flush(&mut requests, &mut n_tokens); } n_tokens += n_tokens_to_add; requests.push(request); - }, - Err(_) => { - if !requests.is_empty() { - let _ = sync_tx.send(requests); - n_tokens = 0; - requests = Vec::new(); + + if requests.len() == conf.max_batch_size { + flush(&mut requests, &mut n_tokens); } - } + }, + Ok(None) => break, // closed + Err(_) => flush(&mut requests, &mut n_tokens), // timeout } } }); From c52f08351fa6dcc00dd09ced48166304c35fec44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 10:57:50 +0000 Subject: [PATCH 35/63] Set TGI_LLAMA_PKG_CUDA from CUDA_VERSION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Dockerfile_llamacpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index ed8783d66f0..b020778f6e0 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -2,6 +2,7 @@ FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS deps ARG llama_version=b4628 ARG llama_cuda_arch=75-real;80-real;86-real;89-real;90-real +ENV TGI_LLAMA_PKG_CUDA=cuda-${CUDA_VERSION%.*} WORKDIR /opt/src @@ -50,7 +51,6 @@ RUN cargo chef cook \ --profile release-opt \ --package text-generation-router-llamacpp COPY . . 
-ENV TGI_LLAMA_PKG_CUDA=cuda-12.6 RUN cargo build \ --profile release-opt \ --package text-generation-router-llamacpp --frozen From 051ff2d5ce442ebb14f1abc796438a3087949341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 11:13:17 +0000 Subject: [PATCH 36/63] Rename bindings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/build.rs | 14 ++- backends/llamacpp/src/backend.rs | 198 +++++++++++++++---------------- 2 files changed, 112 insertions(+), 100 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index e56272eeb5d..1b1c3718f09 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -1,3 +1,5 @@ + +use bindgen::callbacks::{ParseCallbacks, ItemInfo}; use std::collections::HashMap; use std::env; use std::path::PathBuf; @@ -20,6 +22,15 @@ fn inject_transient_dependencies(lib_search_path: Option<&str>, lib_target_hardw } } +#[derive(Debug)] +struct PrefixStripper; + +impl ParseCallbacks for PrefixStripper { + fn generated_name_override(&self, item_info: ItemInfo<'_>) -> Option { + item_info.name.strip_prefix("llama_").map(str::to_string) + } +} + fn main() { let pkg_cuda = option_env!("TGI_LLAMA_PKG_CUDA"); let lib_search_path = option_env!("TGI_LLAMA_LD_LIBRARY_PATH"); @@ -28,13 +39,14 @@ fn main() { let bindings = bindgen::Builder::default() .header("src/wrapper.h") .prepend_enum_name(false) + .parse_callbacks(Box::new(PrefixStripper)) .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) .generate() .expect("Unable to generate bindings"); let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); bindings - .write_to_file(out_path.join("bindings.rs")) + .write_to_file(out_path.join("llamacpp.rs")) .expect("Couldn't write bindings!"); if let Some(pkg_cuda) = pkg_cuda { diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index c6f4e9252c0..aa44df31750 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,9 +1,9 @@ -mod bindings { +mod llamacpp { #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(non_snake_case)] #![allow(dead_code)] - include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + include!(concat!(env!("OUT_DIR"), "/llamacpp.rs")); } use async_trait::async_trait; use std::ffi::CString; @@ -91,39 +91,39 @@ pub enum LlamacppGGMLType { // TODO: macro impl LlamacppGGMLType { - fn to_ggml_type(&self) -> bindings::ggml_type { + fn to_ggml_type(&self) -> llamacpp::ggml_type { match self { - LlamacppGGMLType::F32 => bindings::GGML_TYPE_F32, - LlamacppGGMLType::F16 => bindings::GGML_TYPE_F16, - LlamacppGGMLType::Q4_0 => bindings::GGML_TYPE_Q4_0, - LlamacppGGMLType::Q4_1 => bindings::GGML_TYPE_Q4_1, - LlamacppGGMLType::Q5_0 => bindings::GGML_TYPE_Q5_0, - LlamacppGGMLType::Q5_1 => bindings::GGML_TYPE_Q5_1, - LlamacppGGMLType::Q8_0 => bindings::GGML_TYPE_Q8_0, - LlamacppGGMLType::Q8_1 => bindings::GGML_TYPE_Q8_1, - LlamacppGGMLType::Q2_K => bindings::GGML_TYPE_Q2_K, - LlamacppGGMLType::Q3_K => bindings::GGML_TYPE_Q3_K, - LlamacppGGMLType::Q4_K => bindings::GGML_TYPE_Q4_K, - LlamacppGGMLType::Q5_K => bindings::GGML_TYPE_Q5_K, - LlamacppGGMLType::Q6_K => bindings::GGML_TYPE_Q6_K, - LlamacppGGMLType::Q8_K => bindings::GGML_TYPE_Q8_K, - LlamacppGGMLType::IQ2_XXS => bindings::GGML_TYPE_IQ2_XXS, - LlamacppGGMLType::IQ2_XS => bindings::GGML_TYPE_IQ2_XS, - LlamacppGGMLType::IQ3_XXS => bindings::GGML_TYPE_IQ3_XXS, - 
LlamacppGGMLType::IQ1_S => bindings::GGML_TYPE_IQ1_S, - LlamacppGGMLType::IQ4_NL => bindings::GGML_TYPE_IQ4_NL, - LlamacppGGMLType::IQ3_S => bindings::GGML_TYPE_IQ3_S, - LlamacppGGMLType::IQ2_S => bindings::GGML_TYPE_IQ2_S, - LlamacppGGMLType::IQ4_XS => bindings::GGML_TYPE_IQ4_XS, - LlamacppGGMLType::I8 => bindings::GGML_TYPE_I8, - LlamacppGGMLType::I16 => bindings::GGML_TYPE_I16, - LlamacppGGMLType::I32 => bindings::GGML_TYPE_I32, - LlamacppGGMLType::I64 => bindings::GGML_TYPE_I64, - LlamacppGGMLType::F64 => bindings::GGML_TYPE_F64, - LlamacppGGMLType::IQ1_M => bindings::GGML_TYPE_IQ1_M, - LlamacppGGMLType::BF16 => bindings::GGML_TYPE_BF16, - LlamacppGGMLType::TQ1_0 => bindings::GGML_TYPE_TQ1_0, - LlamacppGGMLType::TQ2_0 => bindings::GGML_TYPE_TQ2_0, + LlamacppGGMLType::F32 => llamacpp::GGML_TYPE_F32, + LlamacppGGMLType::F16 => llamacpp::GGML_TYPE_F16, + LlamacppGGMLType::Q4_0 => llamacpp::GGML_TYPE_Q4_0, + LlamacppGGMLType::Q4_1 => llamacpp::GGML_TYPE_Q4_1, + LlamacppGGMLType::Q5_0 => llamacpp::GGML_TYPE_Q5_0, + LlamacppGGMLType::Q5_1 => llamacpp::GGML_TYPE_Q5_1, + LlamacppGGMLType::Q8_0 => llamacpp::GGML_TYPE_Q8_0, + LlamacppGGMLType::Q8_1 => llamacpp::GGML_TYPE_Q8_1, + LlamacppGGMLType::Q2_K => llamacpp::GGML_TYPE_Q2_K, + LlamacppGGMLType::Q3_K => llamacpp::GGML_TYPE_Q3_K, + LlamacppGGMLType::Q4_K => llamacpp::GGML_TYPE_Q4_K, + LlamacppGGMLType::Q5_K => llamacpp::GGML_TYPE_Q5_K, + LlamacppGGMLType::Q6_K => llamacpp::GGML_TYPE_Q6_K, + LlamacppGGMLType::Q8_K => llamacpp::GGML_TYPE_Q8_K, + LlamacppGGMLType::IQ2_XXS => llamacpp::GGML_TYPE_IQ2_XXS, + LlamacppGGMLType::IQ2_XS => llamacpp::GGML_TYPE_IQ2_XS, + LlamacppGGMLType::IQ3_XXS => llamacpp::GGML_TYPE_IQ3_XXS, + LlamacppGGMLType::IQ1_S => llamacpp::GGML_TYPE_IQ1_S, + LlamacppGGMLType::IQ4_NL => llamacpp::GGML_TYPE_IQ4_NL, + LlamacppGGMLType::IQ3_S => llamacpp::GGML_TYPE_IQ3_S, + LlamacppGGMLType::IQ2_S => llamacpp::GGML_TYPE_IQ2_S, + LlamacppGGMLType::IQ4_XS => llamacpp::GGML_TYPE_IQ4_XS, + LlamacppGGMLType::I8 => llamacpp::GGML_TYPE_I8, + LlamacppGGMLType::I16 => llamacpp::GGML_TYPE_I16, + LlamacppGGMLType::I32 => llamacpp::GGML_TYPE_I32, + LlamacppGGMLType::I64 => llamacpp::GGML_TYPE_I64, + LlamacppGGMLType::F64 => llamacpp::GGML_TYPE_F64, + LlamacppGGMLType::IQ1_M => llamacpp::GGML_TYPE_IQ1_M, + LlamacppGGMLType::BF16 => llamacpp::GGML_TYPE_BF16, + LlamacppGGMLType::TQ1_0 => llamacpp::GGML_TYPE_TQ1_0, + LlamacppGGMLType::TQ2_0 => llamacpp::GGML_TYPE_TQ2_0, } } } @@ -201,16 +201,16 @@ impl LlamacppRequest { } struct Llamacpp { - model: *mut bindings::llama_model, - ctx: *mut bindings::llama_context, - vocab: *const bindings::llama_vocab, - logprobs: Vec, - batch: bindings::llama_batch, + model: *mut llamacpp::llama_model, + ctx: *mut llamacpp::llama_context, + vocab: *const llamacpp::llama_vocab, + logprobs: Vec, + batch: llamacpp::llama_batch, n_ctx: u32, } extern "C" fn llamacpp_log_callback( - level: bindings::ggml_log_level, + level: llamacpp::ggml_log_level, msg: *const std::os::raw::c_char, _user_data: *mut std::os::raw::c_void, ) { @@ -218,10 +218,10 @@ extern "C" fn llamacpp_log_callback( let rmsg = cmsg.to_string_lossy().trim_end_matches('\n').to_string(); match level { - bindings::GGML_LOG_LEVEL_DEBUG => debug!(target: "llamacpp", "{}", rmsg), - bindings::GGML_LOG_LEVEL_INFO => info!(target: "llamacpp", "{}", rmsg), - bindings::GGML_LOG_LEVEL_WARN => warn!(target: "llamacpp", "{}", rmsg), - bindings::GGML_LOG_LEVEL_ERROR => error!(target: "llamacpp", "{}", rmsg), + llamacpp::GGML_LOG_LEVEL_DEBUG => debug!(target: "llamacpp", 
"{}", rmsg), + llamacpp::GGML_LOG_LEVEL_INFO => info!(target: "llamacpp", "{}", rmsg), + llamacpp::GGML_LOG_LEVEL_WARN => warn!(target: "llamacpp", "{}", rmsg), + llamacpp::GGML_LOG_LEVEL_ERROR => error!(target: "llamacpp", "{}", rmsg), _ => trace!(target: "llamacpp", "{}", rmsg), } } @@ -231,12 +231,12 @@ impl Llamacpp { let gguf = CString::new(conf.model_gguf)?; let model = unsafe { - let mut params = bindings::llama_model_default_params(); + let mut params = llamacpp::model_default_params(); params.n_gpu_layers = conf.n_gpu_layers as _; params.split_mode = match conf.split_mode { - LlamacppSplitMode::GPU(_) => bindings::LLAMA_SPLIT_MODE_NONE, - LlamacppSplitMode::Layer => bindings::LLAMA_SPLIT_MODE_LAYER, - LlamacppSplitMode::Row => bindings::LLAMA_SPLIT_MODE_ROW, + LlamacppSplitMode::GPU(_) => llamacpp::LLAMA_SPLIT_MODE_NONE, + LlamacppSplitMode::Layer => llamacpp::LLAMA_SPLIT_MODE_LAYER, + LlamacppSplitMode::Row => llamacpp::LLAMA_SPLIT_MODE_ROW, }; params.main_gpu = match conf.split_mode { LlamacppSplitMode::GPU(n) => n as _, @@ -244,13 +244,13 @@ impl Llamacpp { }; params.use_mmap = conf.use_mmap; params.use_mlock = conf.use_mlock; - bindings::llama_model_load_from_file(gguf.as_ptr(), params) + llamacpp::model_load_from_file(gguf.as_ptr(), params) }; if model.is_null() { return Err(BackendError::Llamacpp("Failed to load model".to_string())) } let ctx = unsafe { - let mut params = bindings::llama_context_default_params(); + let mut params = llamacpp::context_default_params(); params.n_ctx = conf.n_ctx as _; params.n_batch = conf.max_batch_total_tokens as _; params.n_ubatch = conf.max_physical_batch_total_tokens as _; @@ -263,48 +263,48 @@ impl Llamacpp { params.type_k = conf.type_k.to_ggml_type(); params.type_v = conf.type_v.to_ggml_type(); params.no_perf = true; - bindings::llama_init_from_model(model, params) + llamacpp::init_from_model(model, params) }; if ctx.is_null() { return Err(BackendError::Llamacpp("Failed to init context".to_string())) } - let n_ctx = unsafe { bindings::llama_n_ctx(ctx) }; + let n_ctx = unsafe { llamacpp::n_ctx(ctx) }; let vocab = unsafe { - bindings::llama_model_get_vocab(model) + llamacpp::model_get_vocab(model) }; if vocab.is_null() { return Err(BackendError::Llamacpp("Failed to get vocab".to_string())); } let n_tokens = unsafe { - bindings::llama_vocab_n_tokens(vocab) + llamacpp::vocab_n_tokens(vocab) }; let mut logprobs = Vec::with_capacity(n_tokens as usize); for token in 0..n_tokens { - logprobs.push(bindings::llama_token_data { + logprobs.push(llamacpp::llama_token_data { id: token, logit: 0.0, p: 0.0, }); } let batch = unsafe { - bindings::llama_batch_init(conf.max_batch_total_tokens as _, 0, 1) + llamacpp::batch_init(conf.max_batch_total_tokens as _, 0, 1) }; Ok(Llamacpp{model, ctx, vocab, logprobs, n_ctx, batch}) } - fn clear_kv_cache(&mut self, seq_id: bindings::llama_seq_id) { + fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) { unsafe { - bindings::llama_kv_cache_seq_rm(self.ctx, seq_id, -1, -1); + llamacpp::kv_cache_seq_rm(self.ctx, seq_id, -1, -1); } } fn batch_push( &mut self, - token: bindings::llama_token, - pos: bindings::llama_pos, - seq_id: bindings::llama_seq_id, + token: llamacpp::llama_token, + pos: llamacpp::llama_pos, + seq_id: llamacpp::llama_seq_id, logits: bool, ) -> usize { let n = self.batch.n_tokens as usize; @@ -323,43 +323,43 @@ impl Llamacpp { impl Drop for Llamacpp { fn drop(&mut self) { if !self.ctx.is_null() { - unsafe { bindings::llama_free(self.ctx) }; + unsafe { llamacpp::free(self.ctx) }; } if 
!self.model.is_null() { - unsafe { bindings::llama_model_free(self.model) }; + unsafe { llamacpp::model_free(self.model) }; } - unsafe { bindings::llama_batch_free(self.batch) }; + unsafe { llamacpp::batch_free(self.batch) }; } } struct LlamacppSampler { - chain: *mut bindings::llama_sampler, + chain: *mut llamacpp::llama_sampler, } impl LlamacppSampler { fn new(req: &LlamacppRequest) -> Option { let chain = unsafe { - let params = bindings::llama_sampler_chain_default_params(); - bindings::llama_sampler_chain_init(params) + let params = llamacpp::sampler_chain_default_params(); + llamacpp::sampler_chain_init(params) }; if chain.is_null() { error!("Failed to init sampler"); return None; } let top_k = unsafe { - bindings::llama_sampler_init_top_k(req.top_k) + llamacpp::sampler_init_top_k(req.top_k) }; let top_p = unsafe { - bindings::llama_sampler_init_top_p(req.top_p, req.min_keep) + llamacpp::sampler_init_top_p(req.top_p, req.min_keep) }; let typical_p = unsafe { - bindings::llama_sampler_init_typical(req.typical_p, req.min_keep) + llamacpp::sampler_init_typical(req.typical_p, req.min_keep) }; let temp = unsafe { - bindings::llama_sampler_init_temp(req.temp) + llamacpp::sampler_init_temp(req.temp) }; let penalties = unsafe { - bindings::llama_sampler_init_penalties( + llamacpp::sampler_init_penalties( req.penalty_last_n, req.penalty_repeat, req.penalty_freq, @@ -367,7 +367,7 @@ impl LlamacppSampler { ) }; let dist = unsafe { - bindings::llama_sampler_init_dist(req.seed) + llamacpp::sampler_init_dist(req.seed) }; let mut failed = false; @@ -381,7 +381,7 @@ impl LlamacppSampler { error!("Failed to init {k} sampler"); failed = true; } else { - unsafe { bindings::llama_sampler_chain_add(chain, *v) }; + unsafe { llamacpp::sampler_chain_add(chain, *v) }; } } if failed { @@ -391,27 +391,27 @@ impl LlamacppSampler { } } - fn sample(&self, llamacpp: &mut Llamacpp, idx: usize) -> (bindings::llama_token, f32) { + fn sample(&self, llamacpp: &mut Llamacpp, idx: usize) -> (llamacpp::llama_token, f32) { let logits = unsafe { - bindings::llama_get_logits_ith(llamacpp.ctx, idx as _) + llamacpp::get_logits_ith(llamacpp.ctx, idx as _) }; for (token, logprob) in llamacpp.logprobs.iter_mut().enumerate() { - *logprob = bindings::llama_token_data { + *logprob = llamacpp::llama_token_data { id: token as _, logit: unsafe { *logits.offset(token as _) }, p: 0.0, }; } - let mut view = bindings::llama_token_data_array { + let mut view = llamacpp::llama_token_data_array { data: llamacpp.logprobs.as_mut_ptr(), size: llamacpp.logprobs.len(), selected: -1, sorted: false, }; unsafe { - bindings::llama_sampler_apply(self.chain, &mut view); + llamacpp::sampler_apply(self.chain, &mut view); let logprob = *view.data.offset(view.selected as _); - bindings::llama_sampler_accept(self.chain, logprob.id); + llamacpp::sampler_accept(self.chain, logprob.id); (logprob.id, logprob.p.ln()) } } @@ -420,7 +420,7 @@ impl LlamacppSampler { impl Drop for LlamacppSampler { fn drop(&mut self) { if !self.chain.is_null() { - unsafe { bindings::llama_sampler_free(self.chain) }; + unsafe { llamacpp::sampler_free(self.chain) }; } } } @@ -428,8 +428,8 @@ impl Drop for LlamacppSampler { struct LlamacppSeq { id: usize, batch_pos: usize, - token: bindings::llama_token, - pos: bindings::llama_pos, + token: llamacpp::llama_token, + pos: llamacpp::llama_pos, sampler: LlamacppSampler, text: String, n_new_tokens: usize, @@ -446,14 +446,14 @@ impl LlamacppBackend { // Setup llama & export logs, once and for all INIT.call_once(|| unsafe { - 
bindings::llama_log_set(Some(llamacpp_log_callback), std::ptr::null_mut()); - bindings::llama_backend_init(); - bindings::llama_numa_init(match conf.numa { - LlamacppNuma::Disabled => bindings::GGML_NUMA_STRATEGY_DISABLED, - LlamacppNuma::Distribute => bindings::GGML_NUMA_STRATEGY_DISTRIBUTE, - LlamacppNuma::Isolate => bindings::GGML_NUMA_STRATEGY_ISOLATE, - LlamacppNuma::Numactl => bindings::GGML_NUMA_STRATEGY_NUMACTL, - LlamacppNuma::Mirror => bindings::GGML_NUMA_STRATEGY_MIRROR, + llamacpp::log_set(Some(llamacpp_log_callback), std::ptr::null_mut()); + llamacpp::backend_init(); + llamacpp::numa_init(match conf.numa { + LlamacppNuma::Disabled => llamacpp::GGML_NUMA_STRATEGY_DISABLED, + LlamacppNuma::Distribute => llamacpp::GGML_NUMA_STRATEGY_DISTRIBUTE, + LlamacppNuma::Isolate => llamacpp::GGML_NUMA_STRATEGY_ISOLATE, + LlamacppNuma::Numactl => llamacpp::GGML_NUMA_STRATEGY_NUMACTL, + LlamacppNuma::Mirror => llamacpp::GGML_NUMA_STRATEGY_MIRROR, }); }); @@ -526,17 +526,17 @@ impl LlamacppBackend { for (pos, &token_id) in request.input_ids.iter().enumerate() { llamacpp.batch_push( - token_id as bindings::llama_token, - pos as bindings::llama_pos, - seq_id as bindings::llama_seq_id, + token_id as llamacpp::llama_token, + pos as llamacpp::llama_pos, + seq_id as llamacpp::llama_seq_id, pos == last_pos, // check samplers ); } seqs.push(LlamacppSeq { id: seq_id, batch_pos: llamacpp.batch.n_tokens as usize - 1, - token: bindings::LLAMA_TOKEN_NULL, - pos: last_pos as bindings::llama_pos + 1, + token: llamacpp::LLAMA_TOKEN_NULL, + pos: last_pos as llamacpp::llama_pos + 1, sampler: sampler, text: String::with_capacity(1024), n_new_tokens: 0, @@ -548,7 +548,7 @@ impl LlamacppBackend { break; } let decode = unsafe { - bindings::llama_decode(llamacpp.ctx, llamacpp.batch) + llamacpp::decode(llamacpp.ctx, llamacpp.batch) }; if decode != 0 { warn!("llama_decode failed, clearing kv cache"); @@ -560,7 +560,7 @@ impl LlamacppBackend { break; } let kv_cache_used_cells = unsafe { - bindings::llama_get_kv_cache_used_cells(llamacpp.ctx) + llamacpp::get_kv_cache_used_cells(llamacpp.ctx) }; for seq in seqs.iter_mut() { if !seq.running { @@ -591,7 +591,7 @@ impl LlamacppBackend { special: special, }; let finish: Option = { - if unsafe { bindings::llama_vocab_is_eog(llamacpp.vocab, next) } { + if unsafe { llamacpp::vocab_is_eog(llamacpp.vocab, next) } { Some(FinishReason::EndOfSequenceToken) } else if seq.n_new_tokens == requests[seq.id].max_new_tokens { Some(FinishReason::Length) From 09a745f1b86d59324d1b389a9afcc71763a53187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 11:31:58 +0000 Subject: [PATCH 37/63] Remove n_ctx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 13 ++----------- backends/llamacpp/src/main.rs | 10 ---------- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index aa44df31750..d81137e6cda 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -130,7 +130,6 @@ impl LlamacppGGMLType { pub struct LlamacppConfig { pub model_gguf: String, - pub n_ctx: usize, pub max_batch_total_tokens: usize, pub max_physical_batch_total_tokens: usize, pub max_batch_size: usize, @@ -206,7 +205,6 @@ struct Llamacpp { vocab: *const llamacpp::llama_vocab, logprobs: Vec, batch: llamacpp::llama_batch, - n_ctx: u32, } extern "C" fn llamacpp_log_callback( @@ 
-251,7 +249,7 @@ impl Llamacpp { } let ctx = unsafe { let mut params = llamacpp::context_default_params(); - params.n_ctx = conf.n_ctx as _; + params.n_ctx = conf.max_batch_total_tokens as _; params.n_batch = conf.max_batch_total_tokens as _; params.n_ubatch = conf.max_physical_batch_total_tokens as _; params.n_seq_max = conf.max_batch_size as _; @@ -268,8 +266,6 @@ impl Llamacpp { if ctx.is_null() { return Err(BackendError::Llamacpp("Failed to init context".to_string())) } - let n_ctx = unsafe { llamacpp::n_ctx(ctx) }; - let vocab = unsafe { llamacpp::model_get_vocab(model) }; @@ -291,7 +287,7 @@ impl Llamacpp { let batch = unsafe { llamacpp::batch_init(conf.max_batch_total_tokens as _, 0, 1) }; - Ok(Llamacpp{model, ctx, vocab, logprobs, n_ctx, batch}) + Ok(Llamacpp{model, ctx, vocab, logprobs, batch}) } fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) { @@ -559,9 +555,6 @@ impl LlamacppBackend { } break; } - let kv_cache_used_cells = unsafe { - llamacpp::get_kv_cache_used_cells(llamacpp.ctx) - }; for seq in seqs.iter_mut() { if !seq.running { continue; @@ -595,8 +588,6 @@ impl LlamacppBackend { Some(FinishReason::EndOfSequenceToken) } else if seq.n_new_tokens == requests[seq.id].max_new_tokens { Some(FinishReason::Length) - } else if kv_cache_used_cells == llamacpp.n_ctx as i32 { - Some(FinishReason::Length) // TODO: check } else { None } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 5548773bf6a..310ca8f1c80 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -24,10 +24,6 @@ struct Args { #[clap(long, env)] model_gguf: String, // TODO Option() with hf->gguf & quantize - /// Context size for the model. - #[clap(default_value = "4096", long, env)] - n_ctx: usize, - /// Number of threads to use for generation. #[clap(long, env)] n_threads: Option, @@ -198,11 +194,6 @@ async fn main() -> Result<(), RouterError> { "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), )); } - if args.max_batch_total_tokens > args.n_ctx { - return Err(RouterError::ArgumentValidation( - "`max_batch_total_tokens` must be <= `n_ctx`".to_string(), - )); - } // TODO: check if we use the same cache of Server // check if llamacpp is faster @@ -224,7 +215,6 @@ async fn main() -> Result<(), RouterError> { let (backend, ok, shutdown) = LlamacppBackend::new( LlamacppConfig { model_gguf: args.model_gguf, - n_ctx: args.n_ctx, n_threads: n_threads, n_threads_batch: n_threads_batch, n_gpu_layers: args.n_gpu_layers, From 5b777877b161ae28b3694a04be6b7ace21262321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 11:40:20 +0000 Subject: [PATCH 38/63] Make max_batch_total_tokens optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 310ca8f1c80..d30d7f82b8b 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -89,8 +89,8 @@ struct Args { max_total_tokens: usize, /// Maximum number of tokens in a batch. - #[clap(default_value = "4096", long, env)] - max_batch_total_tokens: usize, + #[clap(long, env)] + max_batch_total_tokens: Option, /// Maximum number of tokens in a physical batch. 
#[clap(long, env)] @@ -175,8 +175,12 @@ async fn main() -> Result<(), RouterError> { Some(0) | None => n_threads, Some(threads) => threads, }; + let max_batch_total_tokens = match args.max_batch_total_tokens { + None => args.max_batch_size * args.max_total_tokens, + Some(size) => size, + }; let max_physical_batch_total_tokens = match args.max_physical_batch_total_tokens { - None => args.max_batch_total_tokens, + None => max_batch_total_tokens, Some(size) => size, }; if args.max_input_tokens >= args.max_total_tokens { @@ -184,12 +188,12 @@ async fn main() -> Result<(), RouterError> { "`max_input_tokens` must be < `max_total_tokens`".to_string(), )); } - if args.max_total_tokens > args.max_batch_total_tokens { + if args.max_total_tokens > max_batch_total_tokens { return Err(RouterError::ArgumentValidation( "`max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), )); } - if args.max_batch_size * args.max_total_tokens > args.max_batch_total_tokens { + if args.max_batch_size * args.max_total_tokens > max_batch_total_tokens { return Err(RouterError::ArgumentValidation( "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), )); @@ -227,7 +231,7 @@ async fn main() -> Result<(), RouterError> { type_k: args.type_k, type_v: args.type_v, offload_kqv: args.offload_kqv, - max_batch_total_tokens: args.max_batch_total_tokens, + max_batch_total_tokens: max_batch_total_tokens, max_physical_batch_total_tokens: max_physical_batch_total_tokens, max_batch_size: args.max_batch_size, batch_timeout: tokio::time::Duration::from_millis(5), From 695b1292e95ae70b9db228676f073e11e8ec711e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 15:42:59 +0000 Subject: [PATCH 39/63] Ensure all samplers are freed on error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index d81137e6cda..fa0e7beb765 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -365,14 +365,17 @@ impl LlamacppSampler { let dist = unsafe { llamacpp::sampler_init_dist(req.seed) }; + let all = &[ + ("top_k", top_k), + ("top_p", top_p), + ("typical_p", typical_p), + ("temp", temp), + ("penalties", penalties), + ("dist", dist), + ]; let mut failed = false; - for (k, v) in &[( "top_k", top_k ), - ( "top_p", top_p ), - ("typical_p", typical_p), - ( "temp", temp ), - ("penalties", penalties), - ( "dist", dist )] { + for (k, v) in all { if v.is_null() { error!("Failed to init {k} sampler"); failed = true; @@ -381,6 +384,7 @@ impl LlamacppSampler { } } if failed { + unsafe { llamacpp::sampler_free(chain) }; None } else { Some(LlamacppSampler{chain}) From 0f62401b8eb6d9c150426313b5a2a4fc654b809a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 15:44:46 +0000 Subject: [PATCH 40/63] Initialize penalty_last_n with llamacpp default value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index fa0e7beb765..1072dec42d5 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -185,7 +185,7 @@ 
impl LlamacppRequest { min_keep: 0, // disabled temp: from.parameters.temperature as _, seed: from.parameters.seed as _, - penalty_last_n: -1, // 0 = disabled, -1 = context size + penalty_last_n: 64, // 0 = disabled, -1 = context size penalty_repeat: from.parameters.repetition_penalty as _, penalty_freq: from.parameters.frequency_penalty as _, penalty_present: 0.0, // disabled From f22e2fb5509fb639cdecd6ca98345789d8fe35eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 16:12:34 +0000 Subject: [PATCH 41/63] Cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 1072dec42d5..bed7d2bde7a 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -290,6 +290,12 @@ impl Llamacpp { Ok(Llamacpp{model, ctx, vocab, logprobs, batch}) } + fn decode(&mut self) -> i32 { + unsafe { + llamacpp::decode(self.ctx, self.batch) + } + } + fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) { unsafe { llamacpp::kv_cache_seq_rm(self.ctx, seq_id, -1, -1); @@ -543,14 +549,8 @@ impl LlamacppBackend { running: true, }); } - loop { - if llamacpp.batch.n_tokens == 0 { - break; - } - let decode = unsafe { - llamacpp::decode(llamacpp.ctx, llamacpp.batch) - }; - if decode != 0 { + while llamacpp.batch.n_tokens > 0 { + if llamacpp.decode() != 0 { warn!("llama_decode failed, clearing kv cache"); llamacpp.clear_kv_cache(-1); for seq in seqs.iter_mut() { From b3e40c4b66da359451599f00d51a0eb55f181609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 16:38:52 +0000 Subject: [PATCH 42/63] Improve default settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index d30d7f82b8b..47f33430817 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -77,8 +77,8 @@ struct Args { validation_workers: usize, /// Maximum amount of concurrent requests. - #[clap(default_value = "128", long, env)] - max_concurrent_requests: usize, + #[clap(long, env)] + max_concurrent_requests: Option, /// Maximum number of input tokens per request. #[clap(default_value = "1024", long, env)] @@ -97,8 +97,8 @@ struct Args { max_physical_batch_total_tokens: Option, /// Maximum number of requests per batch. - #[clap(default_value = "1", long, env)] - max_batch_size: usize, + #[clap(long, env)] + max_batch_size: Option, /// IP address to listen on. 
#[clap(default_value = "0.0.0.0", long, env)] @@ -175,14 +175,22 @@ async fn main() -> Result<(), RouterError> { Some(0) | None => n_threads, Some(threads) => threads, }; + let max_batch_size = match args.max_batch_size { + Some(0) | None => n_threads_batch, + Some(threads) => threads, + }; let max_batch_total_tokens = match args.max_batch_total_tokens { - None => args.max_batch_size * args.max_total_tokens, + None => max_batch_size * args.max_total_tokens, Some(size) => size, }; let max_physical_batch_total_tokens = match args.max_physical_batch_total_tokens { None => max_batch_total_tokens, Some(size) => size, }; + let max_concurrent_requests = match args.max_concurrent_requests { + None => max_batch_size * 2, + Some(size) => size, + }; if args.max_input_tokens >= args.max_total_tokens { return Err(RouterError::ArgumentValidation( "`max_input_tokens` must be < `max_total_tokens`".to_string(), @@ -193,7 +201,7 @@ async fn main() -> Result<(), RouterError> { "`max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), )); } - if args.max_batch_size * args.max_total_tokens > max_batch_total_tokens { + if max_batch_size * args.max_total_tokens > max_batch_total_tokens { return Err(RouterError::ArgumentValidation( "`max_batch_size` * `max_total_tokens` must be <= `max_batch_total_tokens`".to_string(), )); @@ -233,7 +241,7 @@ async fn main() -> Result<(), RouterError> { offload_kqv: args.offload_kqv, max_batch_total_tokens: max_batch_total_tokens, max_physical_batch_total_tokens: max_physical_batch_total_tokens, - max_batch_size: args.max_batch_size, + max_batch_size: max_batch_size, batch_timeout: tokio::time::Duration::from_millis(5), }, tokenizer, @@ -250,7 +258,7 @@ async fn main() -> Result<(), RouterError> { server::run( backend, - args.max_concurrent_requests, + max_concurrent_requests, 0, // max_best_of 0, // max_stop_sequences 0, // max_top_n_tokens From 1641c22af8e7fb264dc0dbc22ae965925b981acd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Wed, 5 Feb 2025 21:14:30 +0000 Subject: [PATCH 43/63] Add doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Dockerfile_llamacpp | 15 +++--- backends/llamacpp/src/main.rs | 2 +- docs/source/backends/llamacpp.md | 92 ++++++++++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 8 deletions(-) create mode 100644 docs/source/backends/llamacpp.md diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index b020778f6e0..67fb82b5269 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -1,7 +1,8 @@ FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS deps -ARG llama_version=b4628 -ARG llama_cuda_arch=75-real;80-real;86-real;89-real;90-real +ARG llamacpp_version=b4628 +ARG llamacpp_cuda=OFF +ARG cuda_arch=75-real;80-real;86-real;89-real;90-real ENV TGI_LLAMA_PKG_CUDA=cuda-${CUDA_VERSION%.*} WORKDIR /opt/src @@ -17,16 +18,16 @@ RUN apt update && apt install -y \ pkg-config \ tar -ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llama_version}.tar.gz /opt/src/ -RUN tar -xzf ${llama_version}.tar.gz \ - && cd llama.cpp-${llama_version} \ +ADD https://github.com/ggerganov/llama.cpp/archive/refs/tags/${llamacpp_version}.tar.gz /opt/src/ +RUN tar -xzf ${llamacpp_version}.tar.gz \ + && cd llama.cpp-${llamacpp_version} \ && cmake -B build \ -DCMAKE_INSTALL_PREFIX=/usr \ -DCMAKE_INSTALL_LIBDIR=/usr/lib \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_CUDA_ARCHITECTURES=${llama_cuda_arch} \ - -DGGML_CUDA=1 \ 
+ -DCMAKE_CUDA_ARCHITECTURES=${cuda_arch} \ + -DGGML_CUDA=${llamacpp_cuda} \ -DLLAMA_BUILD_COMMON=OFF \ -DLLAMA_BUILD_TESTS=OFF \ -DLLAMA_BUILD_EXAMPLES=OFF \ diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 47f33430817..5512c59e802 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -105,7 +105,7 @@ struct Args { hostname: String, /// Port to listen on. - #[clap(default_value = "3001", long, short, env)] + #[clap(default_value = "3000", long, short, env)] port: u16, /// Enable JSON output format. diff --git a/docs/source/backends/llamacpp.md b/docs/source/backends/llamacpp.md new file mode 100644 index 00000000000..86e0f0ebd15 --- /dev/null +++ b/docs/source/backends/llamacpp.md @@ -0,0 +1,92 @@ +# Llamacpp backend + +The llamacpp backend is a backend for running LLMs using the `llama.cpp` +project. It supports CPU and GPU inference and is easy to deploy without +complex dependencies. For more details, visit the official repository: +[llama.cpp](https://github.com/ggerganov/llama.cpp). + +## Supported models + +`llama.cpp` uses the GGUF format, which supports various quantization +levels to optimize performance and reduce memory usage. Learn more and +find GGUF models on [Hugging Face](https://huggingface.co/models?search=gguf). + +## Building the Docker image + +The llamacpp backend is optimized for the local machine, so it is highly +recommended to build the Docker image on the same machine where it will +be used for inference. You can build it directly from the GitHub +repository without cloning using the following command: + +```bash +docker build \ + -t llamacpp-backend \ + https://github.com/huggingface/text-generation-inference.git \ + -f Dockerfile_llamacpp +``` + +### Build arguments + +You can customize the build using the following arguments: + +| Argument | Description | +|----------------------------------------|----------------------------------------------| +| `--build-arg llamacpp_version=VERSION` | Specifies a particular version of llama.cpp. | +| `--build-arg llamacpp_cuda=ON` | Enables CUDA support. | +| `--build-arg cuda_arch=ARCH` | Selects the target GPU architecture. | + +## Preparing the model + +Before running TGI, you need a GGUF model, for example: + +```bash +mkdir -p ~/models +cd ~/models +curl -O "https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_0.gguf?download=true" +``` + +## Running the llamacpp backend + +Run TGI with the llamacpp backend and your chosen model. When using GPU +inference, you need to set `--gpus`, like `--gpus all` for example. Below is +an example for CPU-only inference: + +```bash +docker run \ + -p 3000:3000 \ + -e "HF_TOKEN=$HF_TOKEN" \ + -v "$HOME/models:/models" \ + llamacpp-backend \ + --model-id "Qwen/Qwen2.5-3B-Instruct" \ + --model-gguf "/models/qwen2.5-3b-instruct-q4_0.gguf" +``` + +This will start the server and expose the API on port 3000. 
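A quick way to verify the server is running is to send a small request to TGI's standard `/generate` route. This is only an illustrative sketch: the prompt text and `max_new_tokens` value are arbitrary, and it assumes the default port mapping shown above.

```bash
curl http://localhost:3000/generate \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 32}}'
```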
+ +## Configuration options + +The llamacpp backend provides various options to optimize performance: + +| Argument | Description | +|---------------------------------------|------------------------------------------------------------------------| +| `--n-threads N` | Number of threads to use for generation | +| `--n-threads-batch N` | Number of threads to use for batch processing | +| `--n-gpu-layers N` | Number of layers to store in VRAM | +| `--split-mode MODE` | Split the model across multiple GPUs | +| `--defrag-threshold FLOAT` | Defragment the KV cache if holes/size > threshold | +| `--numa MODE` | Enable NUMA optimizations | +| `--use-mmap` | Use memory mapping for the model | +| `--use-mlock` | Use memory locking to prevent swapping | +| `--offload-kqv` | Enable offloading of KQV operations to the GPU | +| `--flash-attention` | Enable flash attention for faster inference. (EXPERIMENTAL) | +| `--type-k TYPE` | Data type used for K cache | +| `--type-v TYPE` | Data type used for V cache | +| `--validation-workers N` | Number of tokenizer workers used for payload validation and truncation | +| `--max-concurrent-requests N` | Maximum amount of concurrent requests | +| `--max-input-tokens N` | Maximum number of input tokens per request | +| `--max-total-tokens N` | Maximum total tokens (input + output) per request | +| `--max-batch-total-tokens N` | Maximum number of tokens in a batch | +| `--max-physical-batch-total-tokens N` | Maximum number of tokens in a physical batch | +| `--max-batch-size N` | Maximum number of requests per batch | + +You can also run the docker with `--help` for more information. From e4d5fa7eafb7e4cd08cab70ce60091a8ce78ef19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 09:46:24 +0000 Subject: [PATCH 44/63] Update docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 6 +- docs/source/backends/llamacpp.md | 144 ++++++++++++++++++------------- 2 files changed, 88 insertions(+), 62 deletions(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 5512c59e802..df15189b272 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -76,7 +76,7 @@ struct Args { #[clap(default_value = "2", long, env)] validation_workers: usize, - /// Maximum amount of concurrent requests. + /// Maximum number of concurrent requests. #[clap(long, env)] max_concurrent_requests: Option, @@ -84,7 +84,7 @@ struct Args { #[clap(default_value = "1024", long, env)] max_input_tokens: usize, - /// Maximum total tokens (input + output) per request. + /// Maximum number of total tokens (input + output) per request. #[clap(default_value = "2048", long, env)] max_total_tokens: usize, @@ -152,7 +152,7 @@ struct Args { #[clap(default_value = "on", long, env)] usage_stats: usage_stats::UsageStatsLevel, - /// Maximum payload size limit in bytes. + /// Maximum payload size in bytes. #[clap(default_value = "2000000", long, env)] payload_limit: usize, } diff --git a/docs/source/backends/llamacpp.md b/docs/source/backends/llamacpp.md index 86e0f0ebd15..f5aeb52c06f 100644 --- a/docs/source/backends/llamacpp.md +++ b/docs/source/backends/llamacpp.md @@ -1,43 +1,52 @@ -# Llamacpp backend +# Llamacpp Backend -The llamacpp backend is a backend for running LLMs using the `llama.cpp` -project. It supports CPU and GPU inference and is easy to deploy without -complex dependencies. 
For more details, visit the official repository: -[llama.cpp](https://github.com/ggerganov/llama.cpp). +The llamacpp backend facilitates the deployment of large language models +(LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine +optimized for both CPU and GPU computation. This backend is a component +of Hugging Face’s **Text Generation Inference (TGI)** suite, +specifically designed to streamline the deployment of LLMs in production +environments. -## Supported models +## Key Capabilities -`llama.cpp` uses the GGUF format, which supports various quantization -levels to optimize performance and reduce memory usage. Learn more and -find GGUF models on [Hugging Face](https://huggingface.co/models?search=gguf). +- Full compatibility with GGUF format and all quantization formats + (GGUF-related constraints may be mitigated dynamically by on-the-fly + generation in future updates) +- Optimized inference on CPU and GPU architectures +- Containerized deployment, eliminating dependency complexity +- Seamless interoperability with the Hugging Face ecosystem -## Building the Docker image +## Model Compatibility -The llamacpp backend is optimized for the local machine, so it is highly -recommended to build the Docker image on the same machine where it will -be used for inference. You can build it directly from the GitHub -repository without cloning using the following command: +This backend leverages models formatted in **GGUF**, providing an +optimized balance between computational efficiency and model accuracy. +You will find the best models on [Hugging Face][GGUF]. + +## Build Docker image + +For optimal performance, the Docker image is compiled with native CPU +instructions, thus it's highly recommended to execute the container on +the host used during the build process. Efforts are ongoing to enhance +portability while maintaining high computational efficiency. ```bash docker build \ - -t llamacpp-backend \ + -t tgi-llamacpp \ https://github.com/huggingface/text-generation-inference.git \ -f Dockerfile_llamacpp ``` -### Build arguments - -You can customize the build using the following arguments: +### Build parameters -| Argument | Description | -|----------------------------------------|----------------------------------------------| -| `--build-arg llamacpp_version=VERSION` | Specifies a particular version of llama.cpp. | -| `--build-arg llamacpp_cuda=ON` | Enables CUDA support. | -| `--build-arg cuda_arch=ARCH` | Selects the target GPU architecture. | +| Parameter | Description | +| ------------------------------------ | --------------------------------- | +| `--build-arg llamacpp_version=bXXXX` | Specific version of llama.cpp | +| `--build-arg llamacpp_cuda=ON` | Enables CUDA acceleration | +| `--build-arg cuda_arch=ARCH` | Defines target CUDA architecture | -## Preparing the model +## Model preparation -Before running TGI, you need a GGUF model, for example: +Retrieve a GGUF model and store it in a specific directory, for example: ```bash mkdir -p ~/models @@ -45,48 +54,65 @@ cd ~/models curl -O "https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_0.gguf?download=true" ``` -## Running the llamacpp backend +## Run Docker image -Run TGI with the llamacpp backend and your chosen model. When using GPU -inference, you need to set `--gpus`, like `--gpus all` for example. 
Below is -an example for CPU-only inference: +### CPU-based inference ```bash docker run \ -p 3000:3000 \ -e "HF_TOKEN=$HF_TOKEN" \ -v "$HOME/models:/models" \ - llamacpp-backend \ + tgi-llamacpp \ --model-id "Qwen/Qwen2.5-3B-Instruct" \ --model-gguf "/models/qwen2.5-3b-instruct-q4_0.gguf" ``` -This will start the server and expose the API on port 3000. - -## Configuration options - -The llamacpp backend provides various options to optimize performance: - -| Argument | Description | -|---------------------------------------|------------------------------------------------------------------------| -| `--n-threads N` | Number of threads to use for generation | -| `--n-threads-batch N` | Number of threads to use for batch processing | -| `--n-gpu-layers N` | Number of layers to store in VRAM | -| `--split-mode MODE` | Split the model across multiple GPUs | -| `--defrag-threshold FLOAT` | Defragment the KV cache if holes/size > threshold | -| `--numa MODE` | Enable NUMA optimizations | -| `--use-mmap` | Use memory mapping for the model | -| `--use-mlock` | Use memory locking to prevent swapping | -| `--offload-kqv` | Enable offloading of KQV operations to the GPU | -| `--flash-attention` | Enable flash attention for faster inference. (EXPERIMENTAL) | -| `--type-k TYPE` | Data type used for K cache | -| `--type-v TYPE` | Data type used for V cache | -| `--validation-workers N` | Number of tokenizer workers used for payload validation and truncation | -| `--max-concurrent-requests N` | Maximum amount of concurrent requests | -| `--max-input-tokens N` | Maximum number of input tokens per request | -| `--max-total-tokens N` | Maximum total tokens (input + output) per request | -| `--max-batch-total-tokens N` | Maximum number of tokens in a batch | -| `--max-physical-batch-total-tokens N` | Maximum number of tokens in a physical batch | -| `--max-batch-size N` | Maximum number of requests per batch | - -You can also run the docker with `--help` for more information. 
+### GPU-Accelerated inference + +```bash +docker run \ + --gpus all \ + -p 3000:3000 \ + -e "HF_TOKEN=$HF_TOKEN" \ + -v "$HOME/models:/models" \ + tgi-llamacpp \ + --n-gpu-layers 99 + --model-id "Qwen/Qwen2.5-3B-Instruct" \ + --model-gguf "/models/qwen2.5-3b-instruct-q4_0.gguf" +``` + +## Advanced parameters + +A full listing of configurable parameters is available in the `--help`: + +```bash +docker run tgi-llamacpp --help + +``` + +The table below summarizes key options: + +| Parameter | Description | +|-------------------------------------|------------------------------------------------------------------------| +| `--n-threads` | Number of threads to use for generation | +| `--n-threads-batch` | Number of threads to use for batch processing | +| `--n-gpu-layers` | Number of layers to store in VRAM | +| `--split-mode` | Split the model across multiple GPUs | +| `--defrag-threshold` | Defragment the KV cache if holes/size > threshold | +| `--numa` | Enable NUMA optimizations | +| `--use-mlock` | Use memory locking to prevent swapping | +| `--offload-kqv` | Enable offloading of KQV operations to the GPU | +| `--type-k` | Data type used for K cache | +| `--type-v` | Data type used for V cache | +| `--validation-workers` | Number of tokenizer workers used for payload validation and truncation | +| `--max-concurrent-requests` | Maximum number of concurrent requests | +| `--max-input-tokens` | Maximum number of input tokens per request | +| `--max-total-tokens` | Maximum number of total tokens (input + output) per request | +| `--max-batch-total-tokens` | Maximum number of tokens in a batch | +| `--max-physical-batch-total-tokens` | Maximum number of tokens in a physical batch | +| `--max-batch-size` | Maximum number of requests per batch | + +--- +[llama.cpp]: https://github.com/ggerganov/llama.cpp +[GGUF]: https://huggingface.co/models?library=gguf&sort=trending From fb81c0d1c479d39454b397cd2a254eb77760bfed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 10:53:57 +0100 Subject: [PATCH 45/63] Thanks clippy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 28 ++++++++++++---------------- backends/llamacpp/src/main.rs | 12 ++++++------ 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index bed7d2bde7a..5d5eab43cb7 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -38,7 +38,7 @@ impl FromStr for LlamacppSplitMode { "row" => Ok(LlamacppSplitMode::Row), _ => match s.parse::() { Ok(n) => Ok(LlamacppSplitMode::GPU(n)), - Err(_) => Err(format!("Choose a GPU number or `layer` or `row`")), + Err(_) => Err("Choose a GPU number or `layer` or `row`".to_string()), } } } @@ -176,8 +176,7 @@ impl LlamacppRequest { from: &ValidGenerateRequest, tx: UnboundedSender>, ) -> Option{ - if let Some(input_ids) = from.input_ids.as_ref() { - Some(LlamacppRequest { + from.input_ids.as_ref().map(|input_ids| LlamacppRequest { input_ids: input_ids.iter().map(|&x| x as i32).collect(), top_k: from.parameters.top_k as _, top_p: from.parameters.top_p as _, @@ -190,12 +189,9 @@ impl LlamacppRequest { penalty_freq: from.parameters.frequency_penalty as _, penalty_present: 0.0, // disabled max_new_tokens: from.stopping_parameters.max_new_tokens as _, - tx: tx, + tx, time: Instant::now(), }) - } else { - None - } } } @@ -404,7 +400,7 @@ impl LlamacppSampler { for 
(token, logprob) in llamacpp.logprobs.iter_mut().enumerate() { *logprob = llamacpp::llama_token_data { id: token as _, - logit: unsafe { *logits.offset(token as _) }, + logit: unsafe { *logits.add(token) }, p: 0.0, }; } @@ -484,7 +480,7 @@ impl LlamacppBackend { Ok(Some(request)) => { let n_tokens_to_add = request.input_ids.len(); - if n_tokens + n_tokens_to_add > conf.max_batch_total_tokens as usize { + if n_tokens + n_tokens_to_add > conf.max_batch_total_tokens { flush(&mut requests, &mut n_tokens); } n_tokens += n_tokens_to_add; @@ -511,7 +507,7 @@ impl LlamacppBackend { let _ = status_tx.send(true); while let Ok(requests) = sync_rx.recv() { - if shutdown_rx.borrow().clone() { + if *shutdown_rx.borrow() { break; } let start_time = Instant::now(); @@ -521,7 +517,7 @@ impl LlamacppBackend { for (seq_id, request) in requests.iter().enumerate() { debug!("Request: {:?}", request); // TODO remove this - let sampler = match LlamacppSampler::new(&request) { + let sampler = match LlamacppSampler::new(request) { Some(sampler) => sampler, _ => { let _ = request.tx.send(Err(InferError::IncompleteGeneration)); @@ -543,7 +539,7 @@ impl LlamacppBackend { batch_pos: llamacpp.batch.n_tokens as usize - 1, token: llamacpp::LLAMA_TOKEN_NULL, pos: last_pos as llamacpp::llama_pos + 1, - sampler: sampler, + sampler, text: String::with_capacity(1024), n_new_tokens: 0, running: true, @@ -584,8 +580,8 @@ impl LlamacppBackend { let token = Token { id: next as _, text: piece, - logprob: logprob, - special: special, + logprob, + special, }; let finish: Option = { if unsafe { llamacpp::vocab_is_eog(llamacpp.vocab, next) } { @@ -598,7 +594,7 @@ impl LlamacppBackend { }; if let Some(reason) = finish { let _ = requests[seq.id].tx.send(Ok(InferStreamResponse::End { - token: token, + token, top_tokens: vec![], generated_text: GeneratedText { text: seq.text.clone(), @@ -613,7 +609,7 @@ impl LlamacppBackend { continue; } let _ = requests[seq.id].tx.send(Ok(InferStreamResponse::Intermediate { - token: token, + token, top_tokens: vec![], })); } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index df15189b272..762764a709f 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -215,7 +215,7 @@ async fn main() -> Result<(), RouterError> { .ok(); let params = FromPretrainedParameters { revision: args.revision.clone(), - token: token, + token, ..Default::default() }; Tokenizer::from_pretrained( @@ -227,8 +227,8 @@ async fn main() -> Result<(), RouterError> { let (backend, ok, shutdown) = LlamacppBackend::new( LlamacppConfig { model_gguf: args.model_gguf, - n_threads: n_threads, - n_threads_batch: n_threads_batch, + n_threads, + n_threads_batch, n_gpu_layers: args.n_gpu_layers, split_mode: args.split_mode, defrag_threshold: args.defrag_threshold, @@ -239,9 +239,9 @@ async fn main() -> Result<(), RouterError> { type_k: args.type_k, type_v: args.type_v, offload_kqv: args.offload_kqv, - max_batch_total_tokens: max_batch_total_tokens, - max_physical_batch_total_tokens: max_physical_batch_total_tokens, - max_batch_size: max_batch_size, + max_batch_total_tokens, + max_physical_batch_total_tokens, + max_batch_size, batch_timeout: tokio::time::Duration::from_millis(5), }, tokenizer, From 2b0d99c1cf7b7af3cd2590387a2aa11b6e42bb44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 10:08:18 +0000 Subject: [PATCH 46/63] Thanks cargo fmt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien 
Gallouët --- backends/llamacpp/build.rs | 3 +- backends/llamacpp/src/backend.rs | 75 ++++++++++++++++++-------------- backends/llamacpp/src/main.rs | 22 ++++------ 3 files changed, 52 insertions(+), 48 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 1b1c3718f09..aa2a0d8716b 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -1,5 +1,4 @@ - -use bindgen::callbacks::{ParseCallbacks, ItemInfo}; +use bindgen::callbacks::{ItemInfo, ParseCallbacks}; use std::collections::HashMap; use std::env; use std::path::PathBuf; diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 5d5eab43cb7..81f7b9f42a4 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -7,21 +7,21 @@ mod llamacpp { } use async_trait::async_trait; use std::ffi::CString; +use std::mem::replace; +use std::str::FromStr; use std::sync::{mpsc, Once}; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; -use text_generation_router::validation::{ValidGenerateRequest}; +use text_generation_router::validation::ValidGenerateRequest; use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; -use tokio::sync::{watch, oneshot}; +use tokio::sync::{oneshot, watch}; use tokio::task::{spawn, spawn_blocking}; -use tokio::time::{Duration, Instant, timeout}; +use tokio::time::{timeout, Duration, Instant}; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{debug, info, warn, error, trace}; -use tracing::{instrument}; -use std::str::FromStr; -use std::mem::replace; +use tracing::instrument; +use tracing::{debug, error, info, trace, warn}; #[derive(Debug, Clone, Copy)] pub enum LlamacppSplitMode { @@ -39,7 +39,7 @@ impl FromStr for LlamacppSplitMode { _ => match s.parse::() { Ok(n) => Ok(LlamacppSplitMode::GPU(n)), Err(_) => Err("Choose a GPU number or `layer` or `row`".to_string()), - } + }, } } } @@ -175,23 +175,23 @@ impl LlamacppRequest { fn new( from: &ValidGenerateRequest, tx: UnboundedSender>, - ) -> Option{ + ) -> Option { from.input_ids.as_ref().map(|input_ids| LlamacppRequest { - input_ids: input_ids.iter().map(|&x| x as i32).collect(), - top_k: from.parameters.top_k as _, - top_p: from.parameters.top_p as _, - typical_p: from.parameters.typical_p as _, - min_keep: 0, // disabled - temp: from.parameters.temperature as _, - seed: from.parameters.seed as _, - penalty_last_n: 64, // 0 = disabled, -1 = context size - penalty_repeat: from.parameters.repetition_penalty as _, - penalty_freq: from.parameters.frequency_penalty as _, - penalty_present: 0.0, // disabled - max_new_tokens: from.stopping_parameters.max_new_tokens as _, - tx, - time: Instant::now(), - }) + input_ids: input_ids.iter().map(|&x| x as i32).collect(), + top_k: from.parameters.top_k as _, + top_p: from.parameters.top_p as _, + typical_p: from.parameters.typical_p as _, + min_keep: 0, // disabled + temp: from.parameters.temperature as _, + seed: from.parameters.seed as _, + penalty_last_n: 64, // 0 = disabled, -1 = context size + penalty_repeat: from.parameters.repetition_penalty as _, + penalty_freq: from.parameters.frequency_penalty as _, + penalty_present: 0.0, // disabled + max_new_tokens: from.stopping_parameters.max_new_tokens as _, + tx, + time: Instant::now(), + }) } } @@ -241,7 +241,7 @@ impl Llamacpp { llamacpp::model_load_from_file(gguf.as_ptr(), params) }; if model.is_null() { - return 
Err(BackendError::Llamacpp("Failed to load model".to_string())) + return Err(BackendError::Llamacpp("Failed to load model".to_string())); } let ctx = unsafe { let mut params = llamacpp::context_default_params(); @@ -260,7 +260,7 @@ impl Llamacpp { llamacpp::init_from_model(model, params) }; if ctx.is_null() { - return Err(BackendError::Llamacpp("Failed to init context".to_string())) + return Err(BackendError::Llamacpp("Failed to init context".to_string())); } let vocab = unsafe { llamacpp::model_get_vocab(model) @@ -444,8 +444,11 @@ impl LlamacppBackend { pub fn new( conf: LlamacppConfig, tokenizer: Tokenizer, - ) -> (Self, oneshot::Receiver>, watch::Sender) { - + ) -> ( + Self, + oneshot::Receiver>, + watch::Sender, + ) { // Setup llama & export logs, once and for all INIT.call_once(|| unsafe { llamacpp::log_set(Some(llamacpp_log_callback), std::ptr::null_mut()); @@ -489,7 +492,7 @@ impl LlamacppBackend { if requests.len() == conf.max_batch_size { flush(&mut requests, &mut n_tokens); } - }, + } Ok(None) => break, // closed Err(_) => flush(&mut requests, &mut n_tokens), // timeout } @@ -498,8 +501,14 @@ impl LlamacppBackend { spawn_blocking(move || { let mut llamacpp = match Llamacpp::new(conf) { - Ok(v) => { let _ = ok_tx.send(Ok(())); v }, - Err(e) => { let _ = ok_tx.send(Err(e)); return; }, + Ok(v) => { + let _ = ok_tx.send(Ok(())); + v + } + Err(e) => { + let _ = ok_tx.send(Err(e)); + return; + } }; let vocab = tokenizer.get_added_vocabulary(); @@ -522,7 +531,7 @@ impl LlamacppBackend { _ => { let _ = request.tx.send(Err(InferError::IncompleteGeneration)); continue; - }, + } }; let last_pos = request.input_ids.len() - 1; @@ -570,7 +579,7 @@ impl LlamacppBackend { let _ = requests[seq.id].tx.send(Err(InferError::IncompleteGeneration)); seq.running = false; continue; - }, + } }; let special = vocab.is_special_token(&piece); diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 762764a709f..1919580d0dc 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,12 +1,15 @@ mod backend; -use backend::{LlamacppNuma, LlamacppGGMLType, LlamacppSplitMode, LlamacppConfig, LlamacppBackend, BackendError}; -use clap::{Parser}; +use backend::{ + BackendError, LlamacppBackend, LlamacppConfig, LlamacppGGMLType, LlamacppNuma, + LlamacppSplitMode, +}; +use clap::Parser; use text_generation_router::{logging, server, usage_stats}; use thiserror::Error; -use tokenizers::{Tokenizer, FromPretrainedParameters}; +use tokenizers::{FromPretrainedParameters, Tokenizer}; use tokio::sync::oneshot::error::RecvError; -use tracing::{warn, error}; +use tracing::{error, warn}; /// Backend Configuration #[derive(Parser, Debug)] @@ -161,11 +164,7 @@ struct Args { async fn main() -> Result<(), RouterError> { let args = Args::parse(); - logging::init_logging( - args.otlp_endpoint, - args.otlp_service_name, - args.json_output - ); + logging::init_logging(args.otlp_endpoint, args.otlp_service_name, args.json_output); let n_threads = match args.n_threads { Some(0) | None => num_cpus::get(), @@ -218,10 +217,7 @@ async fn main() -> Result<(), RouterError> { token, ..Default::default() }; - Tokenizer::from_pretrained( - args.model_id.clone(), - Some(params) - )? + Tokenizer::from_pretrained(args.model_id.clone(), Some(params))? 
}; let (backend, ok, shutdown) = LlamacppBackend::new( From 8bc10d37ee0613be5af9ac55af7c6c2ec7b8785a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 10:31:05 +0000 Subject: [PATCH 47/63] Update docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- docs/source/_toctree.yml | 2 ++ docs/source/multi_backend_support.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 8fcba516bd2..e073353fca9 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -52,6 +52,8 @@ - sections: - local: backends/trtllm title: TensorRT-LLM + - local: backends/llamacpp + title: Llamacpp title: Backends - sections: - local: reference/launcher diff --git a/docs/source/multi_backend_support.md b/docs/source/multi_backend_support.md index c4df15bc2ca..03d6d30be55 100644 --- a/docs/source/multi_backend_support.md +++ b/docs/source/multi_backend_support.md @@ -11,3 +11,5 @@ TGI remains consistent across backends, allowing you to switch between them seam * **[TGI TRTLLM backend](./backends/trtllm)**: This backend leverages NVIDIA's TensorRT library to accelerate LLM inference. It utilizes specialized optimizations and custom kernels for enhanced performance. However, it requires a model-specific compilation step for each GPU architecture. +* **[TGI Llamacpp backend](./backends/llamacpp)**: This backend facilitates the deployment of large language models + (LLMs) by integrating [llama.cpp][llama.cpp], an advanced inference engine optimized for both CPU and GPU computation. From 7bff88bba999576df0181b9919e88cdb13531465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 13:17:17 +0000 Subject: [PATCH 48/63] Do not use HOSTNAME env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 1919580d0dc..f9bf565111d 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -104,7 +104,7 @@ struct Args { max_batch_size: Option, /// IP address to listen on. - #[clap(default_value = "0.0.0.0", long, env)] + #[clap(default_value = "0.0.0.0", long)] hostname: String, /// Port to listen on. 
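Note on the patch above: dropping `env` from the `hostname` option means the listen address no longer falls back to the `HOSTNAME` environment variable, which Docker typically sets to the container ID. A minimal sketch of the behaviour being removed (illustration only, not TGI code; it assumes clap 4 with the `derive` and `env` features enabled):

```rust
use clap::Parser;

/// Illustration of why `env` was removed: with it, clap reads the HOSTNAME
/// variable before applying the default value.
#[derive(Parser, Debug)]
struct Demo {
    /// Listen address; with `env`, $HOSTNAME silently overrides the default.
    #[clap(default_value = "0.0.0.0", long, env)]
    hostname: String,
}

fn main() {
    // Inside a container, HOSTNAME is usually something like "0f1d2c3b4a5e",
    // which is not a usable bind address. Without `env` (as in the patch),
    // only an explicit --hostname changes the "0.0.0.0" default.
    println!("{:?}", Demo::parse());
}
```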
From df723e646b9eb73180f7632e464d7dbfd3b21a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 13:24:36 +0000 Subject: [PATCH 49/63] Bump llama.cpp & cuda MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Dockerfile_llamacpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 67fb82b5269..6fba85e9d12 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -1,6 +1,6 @@ -FROM nvidia/cuda:12.6.3-cudnn-devel-ubuntu24.04 AS deps +FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps -ARG llamacpp_version=b4628 +ARG llamacpp_version=b4651 ARG llamacpp_cuda=OFF ARG cuda_arch=75-real;80-real;86-real;89-real;90-real ENV TGI_LLAMA_PKG_CUDA=cuda-${CUDA_VERSION%.*} @@ -56,7 +56,7 @@ RUN cargo build \ --profile release-opt \ --package text-generation-router-llamacpp --frozen -FROM nvidia/cuda:12.6.3-cudnn-runtime-ubuntu24.04 +FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu24.04 RUN apt update && apt install -y \ python3-venv \ From 5367d94f344489e0689c8d54e3d89d76fdd31559 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 14:45:55 +0000 Subject: [PATCH 50/63] Fix requirements.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/requirements.txt b/backends/llamacpp/requirements.txt index d7cff7bd0c1..cdce2da2002 100644 --- a/backends/llamacpp/requirements.txt +++ b/backends/llamacpp/requirements.txt @@ -1,2 +1,2 @@ transformers==4.48.2 -huggingface-hub==0.28.1 \ No newline at end of file +huggingface-hub==0.28.1 From 809e288b5a984c90b967cd827312b05f617e80fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 14:58:44 +0000 Subject: [PATCH 51/63] Fix fmt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 213 ++++++++++++++++--------------- backends/llamacpp/src/main.rs | 26 ++-- 2 files changed, 120 insertions(+), 119 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 81f7b9f42a4..dd873f6e5da 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -35,9 +35,9 @@ impl FromStr for LlamacppSplitMode { fn from_str(s: &str) -> Result { match s.to_lowercase().as_str() { "layer" => Ok(LlamacppSplitMode::Layer), - "row" => Ok(LlamacppSplitMode::Row), + "row" => Ok(LlamacppSplitMode::Row), _ => match s.parse::() { - Ok(n) => Ok(LlamacppSplitMode::GPU(n)), + Ok(n) => Ok(LlamacppSplitMode::GPU(n)), Err(_) => Err("Choose a GPU number or `layer` or `row`".to_string()), }, } @@ -93,37 +93,37 @@ pub enum LlamacppGGMLType { impl LlamacppGGMLType { fn to_ggml_type(&self) -> llamacpp::ggml_type { match self { - LlamacppGGMLType::F32 => llamacpp::GGML_TYPE_F32, - LlamacppGGMLType::F16 => llamacpp::GGML_TYPE_F16, - LlamacppGGMLType::Q4_0 => llamacpp::GGML_TYPE_Q4_0, - LlamacppGGMLType::Q4_1 => llamacpp::GGML_TYPE_Q4_1, - LlamacppGGMLType::Q5_0 => llamacpp::GGML_TYPE_Q5_0, - LlamacppGGMLType::Q5_1 => llamacpp::GGML_TYPE_Q5_1, - LlamacppGGMLType::Q8_0 => llamacpp::GGML_TYPE_Q8_0, - LlamacppGGMLType::Q8_1 => llamacpp::GGML_TYPE_Q8_1, - LlamacppGGMLType::Q2_K => llamacpp::GGML_TYPE_Q2_K, - 
LlamacppGGMLType::Q3_K => llamacpp::GGML_TYPE_Q3_K, - LlamacppGGMLType::Q4_K => llamacpp::GGML_TYPE_Q4_K, - LlamacppGGMLType::Q5_K => llamacpp::GGML_TYPE_Q5_K, - LlamacppGGMLType::Q6_K => llamacpp::GGML_TYPE_Q6_K, - LlamacppGGMLType::Q8_K => llamacpp::GGML_TYPE_Q8_K, + LlamacppGGMLType::F32 => llamacpp::GGML_TYPE_F32, + LlamacppGGMLType::F16 => llamacpp::GGML_TYPE_F16, + LlamacppGGMLType::Q4_0 => llamacpp::GGML_TYPE_Q4_0, + LlamacppGGMLType::Q4_1 => llamacpp::GGML_TYPE_Q4_1, + LlamacppGGMLType::Q5_0 => llamacpp::GGML_TYPE_Q5_0, + LlamacppGGMLType::Q5_1 => llamacpp::GGML_TYPE_Q5_1, + LlamacppGGMLType::Q8_0 => llamacpp::GGML_TYPE_Q8_0, + LlamacppGGMLType::Q8_1 => llamacpp::GGML_TYPE_Q8_1, + LlamacppGGMLType::Q2_K => llamacpp::GGML_TYPE_Q2_K, + LlamacppGGMLType::Q3_K => llamacpp::GGML_TYPE_Q3_K, + LlamacppGGMLType::Q4_K => llamacpp::GGML_TYPE_Q4_K, + LlamacppGGMLType::Q5_K => llamacpp::GGML_TYPE_Q5_K, + LlamacppGGMLType::Q6_K => llamacpp::GGML_TYPE_Q6_K, + LlamacppGGMLType::Q8_K => llamacpp::GGML_TYPE_Q8_K, LlamacppGGMLType::IQ2_XXS => llamacpp::GGML_TYPE_IQ2_XXS, - LlamacppGGMLType::IQ2_XS => llamacpp::GGML_TYPE_IQ2_XS, + LlamacppGGMLType::IQ2_XS => llamacpp::GGML_TYPE_IQ2_XS, LlamacppGGMLType::IQ3_XXS => llamacpp::GGML_TYPE_IQ3_XXS, - LlamacppGGMLType::IQ1_S => llamacpp::GGML_TYPE_IQ1_S, - LlamacppGGMLType::IQ4_NL => llamacpp::GGML_TYPE_IQ4_NL, - LlamacppGGMLType::IQ3_S => llamacpp::GGML_TYPE_IQ3_S, - LlamacppGGMLType::IQ2_S => llamacpp::GGML_TYPE_IQ2_S, - LlamacppGGMLType::IQ4_XS => llamacpp::GGML_TYPE_IQ4_XS, - LlamacppGGMLType::I8 => llamacpp::GGML_TYPE_I8, - LlamacppGGMLType::I16 => llamacpp::GGML_TYPE_I16, - LlamacppGGMLType::I32 => llamacpp::GGML_TYPE_I32, - LlamacppGGMLType::I64 => llamacpp::GGML_TYPE_I64, - LlamacppGGMLType::F64 => llamacpp::GGML_TYPE_F64, - LlamacppGGMLType::IQ1_M => llamacpp::GGML_TYPE_IQ1_M, - LlamacppGGMLType::BF16 => llamacpp::GGML_TYPE_BF16, - LlamacppGGMLType::TQ1_0 => llamacpp::GGML_TYPE_TQ1_0, - LlamacppGGMLType::TQ2_0 => llamacpp::GGML_TYPE_TQ2_0, + LlamacppGGMLType::IQ1_S => llamacpp::GGML_TYPE_IQ1_S, + LlamacppGGMLType::IQ4_NL => llamacpp::GGML_TYPE_IQ4_NL, + LlamacppGGMLType::IQ3_S => llamacpp::GGML_TYPE_IQ3_S, + LlamacppGGMLType::IQ2_S => llamacpp::GGML_TYPE_IQ2_S, + LlamacppGGMLType::IQ4_XS => llamacpp::GGML_TYPE_IQ4_XS, + LlamacppGGMLType::I8 => llamacpp::GGML_TYPE_I8, + LlamacppGGMLType::I16 => llamacpp::GGML_TYPE_I16, + LlamacppGGMLType::I32 => llamacpp::GGML_TYPE_I32, + LlamacppGGMLType::I64 => llamacpp::GGML_TYPE_I64, + LlamacppGGMLType::F64 => llamacpp::GGML_TYPE_F64, + LlamacppGGMLType::IQ1_M => llamacpp::GGML_TYPE_IQ1_M, + LlamacppGGMLType::BF16 => llamacpp::GGML_TYPE_BF16, + LlamacppGGMLType::TQ1_0 => llamacpp::GGML_TYPE_TQ1_0, + LlamacppGGMLType::TQ2_0 => llamacpp::GGML_TYPE_TQ2_0, } } } @@ -177,18 +177,18 @@ impl LlamacppRequest { tx: UnboundedSender>, ) -> Option { from.input_ids.as_ref().map(|input_ids| LlamacppRequest { - input_ids: input_ids.iter().map(|&x| x as i32).collect(), - top_k: from.parameters.top_k as _, - top_p: from.parameters.top_p as _, - typical_p: from.parameters.typical_p as _, - min_keep: 0, // disabled - temp: from.parameters.temperature as _, - seed: from.parameters.seed as _, - penalty_last_n: 64, // 0 = disabled, -1 = context size - penalty_repeat: from.parameters.repetition_penalty as _, - penalty_freq: from.parameters.frequency_penalty as _, + input_ids: input_ids.iter().map(|&x| x as i32).collect(), + top_k: from.parameters.top_k as _, + top_p: from.parameters.top_p as _, + typical_p: from.parameters.typical_p 
as _, + min_keep: 0, // disabled + temp: from.parameters.temperature as _, + seed: from.parameters.seed as _, + penalty_last_n: 64, // 0 = disabled, -1 = context size + penalty_repeat: from.parameters.repetition_penalty as _, + penalty_freq: from.parameters.frequency_penalty as _, penalty_present: 0.0, // disabled - max_new_tokens: from.stopping_parameters.max_new_tokens as _, + max_new_tokens: from.stopping_parameters.max_new_tokens as _, tx, time: Instant::now(), }) @@ -213,10 +213,10 @@ extern "C" fn llamacpp_log_callback( match level { llamacpp::GGML_LOG_LEVEL_DEBUG => debug!(target: "llamacpp", "{}", rmsg), - llamacpp::GGML_LOG_LEVEL_INFO => info!(target: "llamacpp", "{}", rmsg), - llamacpp::GGML_LOG_LEVEL_WARN => warn!(target: "llamacpp", "{}", rmsg), + llamacpp::GGML_LOG_LEVEL_INFO => info!(target: "llamacpp", "{}", rmsg), + llamacpp::GGML_LOG_LEVEL_WARN => warn!(target: "llamacpp", "{}", rmsg), llamacpp::GGML_LOG_LEVEL_ERROR => error!(target: "llamacpp", "{}", rmsg), - _ => trace!(target: "llamacpp", "{}", rmsg), + _ => trace!(target: "llamacpp", "{}", rmsg), } } @@ -229,14 +229,14 @@ impl Llamacpp { params.n_gpu_layers = conf.n_gpu_layers as _; params.split_mode = match conf.split_mode { LlamacppSplitMode::GPU(_) => llamacpp::LLAMA_SPLIT_MODE_NONE, - LlamacppSplitMode::Layer => llamacpp::LLAMA_SPLIT_MODE_LAYER, - LlamacppSplitMode::Row => llamacpp::LLAMA_SPLIT_MODE_ROW, + LlamacppSplitMode::Layer => llamacpp::LLAMA_SPLIT_MODE_LAYER, + LlamacppSplitMode::Row => llamacpp::LLAMA_SPLIT_MODE_ROW, }; params.main_gpu = match conf.split_mode { LlamacppSplitMode::GPU(n) => n as _, _ => 0, }; - params.use_mmap = conf.use_mmap; + params.use_mmap = conf.use_mmap; params.use_mlock = conf.use_mlock; llamacpp::model_load_from_file(gguf.as_ptr(), params) }; @@ -245,32 +245,28 @@ impl Llamacpp { } let ctx = unsafe { let mut params = llamacpp::context_default_params(); - params.n_ctx = conf.max_batch_total_tokens as _; - params.n_batch = conf.max_batch_total_tokens as _; - params.n_ubatch = conf.max_physical_batch_total_tokens as _; - params.n_seq_max = conf.max_batch_size as _; - params.n_threads = conf.n_threads as _; + params.n_ctx = conf.max_batch_total_tokens as _; + params.n_batch = conf.max_batch_total_tokens as _; + params.n_ubatch = conf.max_physical_batch_total_tokens as _; + params.n_seq_max = conf.max_batch_size as _; + params.n_threads = conf.n_threads as _; params.n_threads_batch = conf.n_threads_batch as _; - params.defrag_thold = conf.defrag_threshold; - params.offload_kqv = conf.offload_kqv; - params.flash_attn = conf.flash_attention; - params.type_k = conf.type_k.to_ggml_type(); - params.type_v = conf.type_v.to_ggml_type(); - params.no_perf = true; + params.defrag_thold = conf.defrag_threshold; + params.offload_kqv = conf.offload_kqv; + params.flash_attn = conf.flash_attention; + params.type_k = conf.type_k.to_ggml_type(); + params.type_v = conf.type_v.to_ggml_type(); + params.no_perf = true; llamacpp::init_from_model(model, params) }; if ctx.is_null() { return Err(BackendError::Llamacpp("Failed to init context".to_string())); } - let vocab = unsafe { - llamacpp::model_get_vocab(model) - }; + let vocab = unsafe { llamacpp::model_get_vocab(model) }; if vocab.is_null() { return Err(BackendError::Llamacpp("Failed to get vocab".to_string())); } - let n_tokens = unsafe { - llamacpp::vocab_n_tokens(vocab) - }; + let n_tokens = unsafe { llamacpp::vocab_n_tokens(vocab) }; let mut logprobs = Vec::with_capacity(n_tokens as usize); for token in 0..n_tokens { @@ -280,16 +276,18 @@ impl 
Llamacpp { p: 0.0, }); } - let batch = unsafe { - llamacpp::batch_init(conf.max_batch_total_tokens as _, 0, 1) - }; - Ok(Llamacpp{model, ctx, vocab, logprobs, batch}) + let batch = unsafe { llamacpp::batch_init(conf.max_batch_total_tokens as _, 0, 1) }; + Ok(Llamacpp { + model, + ctx, + vocab, + logprobs, + batch, + }) } fn decode(&mut self) -> i32 { - unsafe { - llamacpp::decode(self.ctx, self.batch) - } + unsafe { llamacpp::decode(self.ctx, self.batch) } } fn clear_kv_cache(&mut self, seq_id: llamacpp::llama_seq_id) { @@ -344,18 +342,10 @@ impl LlamacppSampler { error!("Failed to init sampler"); return None; } - let top_k = unsafe { - llamacpp::sampler_init_top_k(req.top_k) - }; - let top_p = unsafe { - llamacpp::sampler_init_top_p(req.top_p, req.min_keep) - }; - let typical_p = unsafe { - llamacpp::sampler_init_typical(req.typical_p, req.min_keep) - }; - let temp = unsafe { - llamacpp::sampler_init_temp(req.temp) - }; + let top_k = unsafe { llamacpp::sampler_init_top_k(req.top_k) }; + let top_p = unsafe { llamacpp::sampler_init_top_p(req.top_p, req.min_keep) }; + let typical_p = unsafe { llamacpp::sampler_init_typical(req.typical_p, req.min_keep) }; + let temp = unsafe { llamacpp::sampler_init_temp(req.temp) }; let penalties = unsafe { llamacpp::sampler_init_penalties( req.penalty_last_n, @@ -364,9 +354,7 @@ impl LlamacppSampler { req.penalty_present, ) }; - let dist = unsafe { - llamacpp::sampler_init_dist(req.seed) - }; + let dist = unsafe { llamacpp::sampler_init_dist(req.seed) }; let all = &[ ("top_k", top_k), ("top_p", top_p), @@ -389,14 +377,12 @@ impl LlamacppSampler { unsafe { llamacpp::sampler_free(chain) }; None } else { - Some(LlamacppSampler{chain}) + Some(LlamacppSampler { chain }) } } fn sample(&self, llamacpp: &mut Llamacpp, idx: usize) -> (llamacpp::llama_token, f32) { - let logits = unsafe { - llamacpp::get_logits_ith(llamacpp.ctx, idx as _) - }; + let logits = unsafe { llamacpp::get_logits_ith(llamacpp.ctx, idx as _) }; for (token, logprob) in llamacpp.logprobs.iter_mut().enumerate() { *logprob = llamacpp::llama_token_data { id: token as _, @@ -454,11 +440,11 @@ impl LlamacppBackend { llamacpp::log_set(Some(llamacpp_log_callback), std::ptr::null_mut()); llamacpp::backend_init(); llamacpp::numa_init(match conf.numa { - LlamacppNuma::Disabled => llamacpp::GGML_NUMA_STRATEGY_DISABLED, + LlamacppNuma::Disabled => llamacpp::GGML_NUMA_STRATEGY_DISABLED, LlamacppNuma::Distribute => llamacpp::GGML_NUMA_STRATEGY_DISTRIBUTE, - LlamacppNuma::Isolate => llamacpp::GGML_NUMA_STRATEGY_ISOLATE, - LlamacppNuma::Numactl => llamacpp::GGML_NUMA_STRATEGY_NUMACTL, - LlamacppNuma::Mirror => llamacpp::GGML_NUMA_STRATEGY_MIRROR, + LlamacppNuma::Isolate => llamacpp::GGML_NUMA_STRATEGY_ISOLATE, + LlamacppNuma::Numactl => llamacpp::GGML_NUMA_STRATEGY_NUMACTL, + LlamacppNuma::Mirror => llamacpp::GGML_NUMA_STRATEGY_MIRROR, }); }); @@ -474,7 +460,8 @@ impl LlamacppBackend { let flush = |requests: &mut Vec<_>, n_tokens: &mut usize| { if !requests.is_empty() { - let _ = sync_tx.send(replace(requests, Vec::with_capacity(conf.max_batch_size))); + let _ = + sync_tx.send(replace(requests, Vec::with_capacity(conf.max_batch_size))); *n_tokens = 0; } }; @@ -538,8 +525,8 @@ impl LlamacppBackend { for (pos, &token_id) in request.input_ids.iter().enumerate() { llamacpp.batch_push( token_id as llamacpp::llama_token, - pos as llamacpp::llama_pos, - seq_id as llamacpp::llama_seq_id, + pos as llamacpp::llama_pos, + seq_id as llamacpp::llama_seq_id, pos == last_pos, // check samplers ); } @@ -559,7 +546,9 @@ impl 
LlamacppBackend { warn!("llama_decode failed, clearing kv cache"); llamacpp.clear_kv_cache(-1); for seq in seqs.iter_mut() { - let _ = requests[seq.id].tx.send(Err(InferError::IncompleteGeneration)); + let _ = requests[seq.id] + .tx + .send(Err(InferError::IncompleteGeneration)); seq.running = false; } break; @@ -576,7 +565,9 @@ impl LlamacppBackend { Ok(piece) => piece, Err(e) => { error!("Failed to decode token: {e}"); - let _ = requests[seq.id].tx.send(Err(InferError::IncompleteGeneration)); + let _ = requests[seq.id] + .tx + .send(Err(InferError::IncompleteGeneration)); seq.running = false; continue; } @@ -617,17 +608,20 @@ impl LlamacppBackend { seq.running = false; continue; } - let _ = requests[seq.id].tx.send(Ok(InferStreamResponse::Intermediate { - token, - top_tokens: vec![], - })); + let _ = requests[seq.id] + .tx + .send(Ok(InferStreamResponse::Intermediate { + token, + top_tokens: vec![], + })); } // generate a new batch llamacpp.batch.n_tokens = 0; for seq in seqs.iter_mut() { if seq.running { - seq.batch_pos = llamacpp.batch_push(seq.token, seq.pos, seq.id as _, true); + seq.batch_pos = + llamacpp.batch_push(seq.token, seq.pos, seq.id as _, true); seq.pos += 1; } else { llamacpp.clear_kv_cache(seq.id as _); @@ -636,7 +630,14 @@ impl LlamacppBackend { } } }); - (Self{tx, status: status_rx}, ok_rx, shutdown_tx) + ( + Self { + tx, + status: status_rx, + }, + ok_rx, + shutdown_tx, + ) } } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index f9bf565111d..753138f9919 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -222,23 +222,23 @@ async fn main() -> Result<(), RouterError> { let (backend, ok, shutdown) = LlamacppBackend::new( LlamacppConfig { - model_gguf: args.model_gguf, + model_gguf: args.model_gguf, n_threads, n_threads_batch, - n_gpu_layers: args.n_gpu_layers, - split_mode: args.split_mode, - defrag_threshold: args.defrag_threshold, - numa: args.numa, - use_mmap: args.use_mmap, - use_mlock: args.use_mlock, - flash_attention: args.flash_attention, - type_k: args.type_k, - type_v: args.type_v, - offload_kqv: args.offload_kqv, + n_gpu_layers: args.n_gpu_layers, + split_mode: args.split_mode, + defrag_threshold: args.defrag_threshold, + numa: args.numa, + use_mmap: args.use_mmap, + use_mlock: args.use_mlock, + flash_attention: args.flash_attention, + type_k: args.type_k, + type_v: args.type_v, + offload_kqv: args.offload_kqv, max_batch_total_tokens, max_physical_batch_total_tokens, max_batch_size, - batch_timeout: tokio::time::Duration::from_millis(5), + batch_timeout: tokio::time::Duration::from_millis(5), }, tokenizer, ); @@ -261,7 +261,7 @@ async fn main() -> Result<(), RouterError> { args.max_input_tokens, args.max_total_tokens, args.validation_workers, - None, // api_key + None, // api_key args.model_id, // tokenizer_name args.tokenizer_config_path, Some(args.revision), From 3b1b049b321290f641db2be409f06467faff34e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 18:33:30 +0000 Subject: [PATCH 52/63] Enable KQV offload by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 753138f9919..e8aa579ff5b 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -60,7 +60,7 @@ struct Args { use_mlock: bool, /// Enable 
offloading of KQV operations to the GPU. - #[clap(default_value = "false", long, env)] + #[clap(default_value = "true", long, env)] offload_kqv: bool, /// Enable flash attention for faster inference. (EXPERIMENTAL) From acca9c3e000a12c2c47255118cf1133e3d28a8bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Thu, 6 Feb 2025 18:34:06 +0000 Subject: [PATCH 53/63] Remove Ngrok tunneling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index e8aa579ff5b..a8edc081726 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -127,18 +127,6 @@ struct Args { #[clap(long, env)] cors_allow_origin: Option>, - /// Enable Ngrok tunneling. - #[clap(long, env)] - ngrok: bool, - - /// Ngrok authentication token. - #[clap(long, env)] - ngrok_authtoken: Option, - - /// Ngrok edge to use for tunneling. - #[clap(long, env)] - ngrok_edge: Option, - /// Path to the tokenizer configuration file. #[clap(long, env)] tokenizer_config_path: Option, @@ -269,9 +257,9 @@ async fn main() -> Result<(), RouterError> { args.hostname, args.port, args.cors_allow_origin, - args.ngrok, - args.ngrok_authtoken, - args.ngrok_edge, + false, // ngrok, + None, // ngrok_authtoken, + None, // ngrok_edge, args.disable_grammar_support, args.max_client_batch_size, args.usage_stats, From 0d27ee74de888bff27f667238dec4ed9c1c263ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 08:51:32 +0000 Subject: [PATCH 54/63] Remove .cargo/config.toml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/.cargo/config.toml | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 backends/llamacpp/.cargo/config.toml diff --git a/backends/llamacpp/.cargo/config.toml b/backends/llamacpp/.cargo/config.toml deleted file mode 100644 index ddff4407b90..00000000000 --- a/backends/llamacpp/.cargo/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -[build] -rustflags = ["-C", "target-cpu=native"] From 4841f71a0ef531368d9a9d71cdd3d2face654b2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 12:26:28 +0100 Subject: [PATCH 55/63] Fix Dockerfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Dockerfile_llamacpp | 1 - backends/llamacpp/build.rs | 15 +++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 6fba85e9d12..7404ed4b9d4 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -3,7 +3,6 @@ FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu24.04 AS deps ARG llamacpp_version=b4651 ARG llamacpp_cuda=OFF ARG cuda_arch=75-real;80-real;86-real;89-real;90-real -ENV TGI_LLAMA_PKG_CUDA=cuda-${CUDA_VERSION%.*} WORKDIR /opt/src diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index aa2a0d8716b..b554694b9b2 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -31,10 +31,18 @@ impl ParseCallbacks for PrefixStripper { } fn main() { - let pkg_cuda = option_env!("TGI_LLAMA_PKG_CUDA"); let lib_search_path = option_env!("TGI_LLAMA_LD_LIBRARY_PATH"); let lib_target_hardware = option_env!("TGI_LLAMA_HARDWARE_TARGET").unwrap_or("cpu"); + if let 
Some(cuda_version) = option_env!("CUDA_VERSION") { + let mut version: Vec<&str> = cuda_version.split('.').collect(); + if version.len() > 2 { + version.pop(); + } + pkg_config::Config::new().probe(&version.join(".")).unwrap(); + } + pkg_config::Config::new().probe("llama").unwrap(); + let bindings = bindgen::Builder::default() .header("src/wrapper.h") .prepend_enum_name(false) @@ -48,10 +56,5 @@ fn main() { .write_to_file(out_path.join("llamacpp.rs")) .expect("Couldn't write bindings!"); - if let Some(pkg_cuda) = pkg_cuda { - pkg_config::Config::new().probe(pkg_cuda).unwrap(); - } - pkg_config::Config::new().probe("llama").unwrap(); - inject_transient_dependencies(lib_search_path, lib_target_hardware); } From b6cfa0fbc07cc2b0d5e97490438757da42addf8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 11:48:16 +0000 Subject: [PATCH 56/63] Add missing cuda prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/build.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index b554694b9b2..19b1987d947 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -39,7 +39,8 @@ fn main() { if version.len() > 2 { version.pop(); } - pkg_config::Config::new().probe(&version.join(".")).unwrap(); + let cuda_version = format!("cuda-{}", version.join(".")); + pkg_config::Config::new().probe(&cuda_version).unwrap(); } pkg_config::Config::new().probe("llama").unwrap(); From 6bdb644f2c2d69fe8928274af4f50ec1056de6f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 12:08:02 +0000 Subject: [PATCH 57/63] Handle custom llama.cpp dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/build.rs | 39 +++++++++++---------------------- backends/llamacpp/src/wrapper.h | 1 - 2 files changed, 13 insertions(+), 27 deletions(-) delete mode 100644 backends/llamacpp/src/wrapper.h diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 19b1987d947..499583cd445 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -1,26 +1,7 @@ use bindgen::callbacks::{ItemInfo, ParseCallbacks}; -use std::collections::HashMap; use std::env; use std::path::PathBuf; -fn inject_transient_dependencies(lib_search_path: Option<&str>, lib_target_hardware: &str) { - let hardware_targets = HashMap::from([("cpu", None), ("cuda", Some(vec!["cuda"]))]); - - if let Some(lib_search_path) = lib_search_path { - lib_search_path.split(":").for_each(|path| { - println!("cargo:rustc-link-search=dependency={path}"); - }); - } - - if let Some(hardware_transient_deps) = hardware_targets.get(lib_target_hardware) { - if let Some(additional_transient_deps) = hardware_transient_deps { - additional_transient_deps.iter().for_each(|dep| { - println!("cargo:rustc-link-lib={dep}"); - }); - } - } -} - #[derive(Debug)] struct PrefixStripper; @@ -31,9 +12,6 @@ impl ParseCallbacks for PrefixStripper { } fn main() { - let lib_search_path = option_env!("TGI_LLAMA_LD_LIBRARY_PATH"); - let lib_target_hardware = option_env!("TGI_LLAMA_HARDWARE_TARGET").unwrap_or("cpu"); - if let Some(cuda_version) = option_env!("CUDA_VERSION") { let mut version: Vec<&str> = cuda_version.split('.').collect(); if version.len() > 2 { @@ -42,10 +20,21 @@ fn main() { let cuda_version = format!("cuda-{}", 
version.join(".")); pkg_config::Config::new().probe(&cuda_version).unwrap(); } - pkg_config::Config::new().probe("llama").unwrap(); + let llama = pkg_config::Config::new().probe("llama").unwrap(); + + for path in &llama.link_paths { + println!("cargo:rustc-link-arg=-Wl,-rpath,{}", path.display()); + } + println!("cargo:rustc-link-arg=-Wl,--disable-new-dtags"); let bindings = bindgen::Builder::default() - .header("src/wrapper.h") + .clang_args( + llama + .include_paths + .iter() + .map(|p| format!("-I{}", p.display())), + ) + .header_contents("llama_bindings.h", "#include ") .prepend_enum_name(false) .parse_callbacks(Box::new(PrefixStripper)) .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) @@ -56,6 +45,4 @@ fn main() { bindings .write_to_file(out_path.join("llamacpp.rs")) .expect("Couldn't write bindings!"); - - inject_transient_dependencies(lib_search_path, lib_target_hardware); } diff --git a/backends/llamacpp/src/wrapper.h b/backends/llamacpp/src/wrapper.h deleted file mode 100644 index 630ebeec15a..00000000000 --- a/backends/llamacpp/src/wrapper.h +++ /dev/null @@ -1 +0,0 @@ -#include From 0702e0bfda290ff0a6c46825df6d363838abd3ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 12:08:34 +0000 Subject: [PATCH 58/63] Cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/backend.rs | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index dd873f6e5da..1566e1bf968 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -91,7 +91,7 @@ pub enum LlamacppGGMLType { // TODO: macro impl LlamacppGGMLType { - fn to_ggml_type(&self) -> llamacpp::ggml_type { + fn to_ggml_type(self) -> llamacpp::ggml_type { match self { LlamacppGGMLType::F32 => llamacpp::GGML_TYPE_F32, LlamacppGGMLType::F16 => llamacpp::GGML_TYPE_F16, @@ -342,19 +342,21 @@ impl LlamacppSampler { error!("Failed to init sampler"); return None; } - let top_k = unsafe { llamacpp::sampler_init_top_k(req.top_k) }; - let top_p = unsafe { llamacpp::sampler_init_top_p(req.top_p, req.min_keep) }; - let typical_p = unsafe { llamacpp::sampler_init_typical(req.typical_p, req.min_keep) }; - let temp = unsafe { llamacpp::sampler_init_temp(req.temp) }; - let penalties = unsafe { - llamacpp::sampler_init_penalties( - req.penalty_last_n, - req.penalty_repeat, - req.penalty_freq, - req.penalty_present, + let (top_k, top_p, typical_p, temp, penalties, dist) = unsafe { + ( + llamacpp::sampler_init_top_k(req.top_k), + llamacpp::sampler_init_top_p(req.top_p, req.min_keep), + llamacpp::sampler_init_typical(req.typical_p, req.min_keep), + llamacpp::sampler_init_temp(req.temp), + llamacpp::sampler_init_penalties( + req.penalty_last_n, + req.penalty_repeat, + req.penalty_freq, + req.penalty_present, + ), + llamacpp::sampler_init_dist(req.seed), ) }; - let dist = unsafe { llamacpp::sampler_init_dist(req.seed) }; let all = &[ ("top_k", top_k), ("top_p", top_p), From 508d47f80ffa43ce0e7d097af0778bbc9c500300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 12:12:13 +0000 Subject: [PATCH 59/63] Add README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 
backends/llamacpp/README.md diff --git a/backends/llamacpp/README.md b/backends/llamacpp/README.md new file mode 100644 index 00000000000..0971efc5a39 --- /dev/null +++ b/backends/llamacpp/README.md @@ -0,0 +1,24 @@ +# Llamacpp backend + +If all your dependencies are installed at the system level, running +cargo build should be sufficient. However, if you want to experiment +with different versions of llama.cpp, some additional setup is required. + +## Install llama.cpp + + LLAMACPP_PREFIX=$(pwd)/llama.cpp.out + + git clone https://github.com/ggerganov/llama.cpp + cd llama.cpp + cmake -B build \ + -DCMAKE_INSTALL_PREFIX="$LLAMACPP_PREFIX" \ + -DLLAMA_BUILD_COMMON=OFF \ + -DLLAMA_BUILD_TESTS=OFF \ + -DLLAMA_BUILD_EXAMPLES=OFF \ + -DLLAMA_BUILD_SERVER=OFF + cmake --build build --config Release -j + cmake --install build + +## Build TGI + + PKG_CONFIG_PATH="$LLAMACPP_PREFIX/lib/pkgconfig" cargo build From 14014182432732a83e63b1989554c25d282d1267 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 14:45:53 +0000 Subject: [PATCH 60/63] Add HF transfer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- Dockerfile_llamacpp | 2 ++ backends/llamacpp/requirements.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/Dockerfile_llamacpp b/Dockerfile_llamacpp index 7404ed4b9d4..2eb62a1f66c 100644 --- a/Dockerfile_llamacpp +++ b/Dockerfile_llamacpp @@ -71,4 +71,6 @@ COPY --from=builder /usr/lib/libllama.so /usr/lib/ COPY --from=builder /usr/lib/libggml*.so /usr/lib/ COPY --from=builder /app/target/release-opt/text-generation-router-llamacpp /usr/bin/ +ENV HF_HUB_ENABLE_HF_TRANSFER=1 + ENTRYPOINT ["text-generation-router-llamacpp"] diff --git a/backends/llamacpp/requirements.txt b/backends/llamacpp/requirements.txt index cdce2da2002..5c5d0cc7f11 100644 --- a/backends/llamacpp/requirements.txt +++ b/backends/llamacpp/requirements.txt @@ -1,2 +1,3 @@ transformers==4.48.2 huggingface-hub==0.28.1 +hf-transfer==0.1.9 From b77d05d3af82fa3e7d9323b91aaf9438976aa595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 15:29:05 +0000 Subject: [PATCH 61/63] Fix bool args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- backends/llamacpp/src/main.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index a8edc081726..5a07acdcde9 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -52,19 +52,19 @@ struct Args { numa: LlamacppNuma, /// Use memory mapping for the model. - #[clap(default_value = "true", long, env)] + #[clap(long, env)] use_mmap: bool, /// Use memory locking to prevent swapping. - #[clap(default_value = "false", long, env)] + #[clap(long, env)] use_mlock: bool, /// Enable offloading of KQV operations to the GPU. - #[clap(default_value = "true", long, env)] + #[clap(long, env)] offload_kqv: bool, /// Enable flash attention for faster inference. (EXPERIMENTAL) - #[clap(default_value = "true", long, env)] + #[clap(long, env)] flash_attention: bool, /// Data type used for K cache. @@ -132,7 +132,7 @@ struct Args { tokenizer_config_path: Option, /// Disable grammar support. - #[clap(long, env, default_value_t = false)] + #[clap(long, env)] disable_grammar_support: bool, /// Maximum number of inputs per request. 
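Note on the patch above: removing the hard-coded `default_value` turns each boolean option into an ordinary clap presence flag, off unless the flag is passed, whereas `default_value = "true"` kept the option on regardless of the command line. A minimal sketch of the resulting behaviour (not the actual TGI argument struct; it assumes clap 4 with the `derive` and `env` features):

```rust
use clap::Parser;

/// Sketch of the post-patch flag behaviour: plain `bool` fields are
/// presence flags that default to false.
#[derive(Parser, Debug)]
struct Flags {
    /// Use memory locking to prevent swapping.
    #[clap(long, env)]
    use_mlock: bool,

    /// Enable flash attention for faster inference. (EXPERIMENTAL)
    #[clap(long, env)]
    flash_attention: bool,
}

fn main() {
    // `prog`                   -> use_mlock: false, flash_attention: false
    // `prog --flash-attention` -> use_mlock: false, flash_attention: true
    println!("{:?}", Flags::parse());
}
```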
From d96a77705dd314e5354af80547a190181ed6a385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 16:48:28 +0000 Subject: [PATCH 62/63] Update doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- docs/source/backends/llamacpp.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/backends/llamacpp.md b/docs/source/backends/llamacpp.md index f5aeb52c06f..dd4ef7b74f1 100644 --- a/docs/source/backends/llamacpp.md +++ b/docs/source/backends/llamacpp.md @@ -101,8 +101,10 @@ The table below summarizes key options: | `--split-mode` | Split the model across multiple GPUs | | `--defrag-threshold` | Defragment the KV cache if holes/size > threshold | | `--numa` | Enable NUMA optimizations | +| `--use-mmap` | Use memory mapping for the model | | `--use-mlock` | Use memory locking to prevent swapping | | `--offload-kqv` | Enable offloading of KQV operations to the GPU | +| `--flash-attention` | Enable flash attention for faster inference | | `--type-k` | Data type used for K cache | | `--type-v` | Data type used for V cache | | `--validation-workers` | Number of tokenizer workers used for payload validation and truncation | From 5fb4afbf5e5ab36efdf3dfc50cea3ebc86ee23c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrien=20Gallou=C3=ABt?= Date: Fri, 7 Feb 2025 17:41:14 +0000 Subject: [PATCH 63/63] Update doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Adrien Gallouët --- docs/source/backends/llamacpp.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/backends/llamacpp.md b/docs/source/backends/llamacpp.md index dd4ef7b74f1..dbd93e86606 100644 --- a/docs/source/backends/llamacpp.md +++ b/docs/source/backends/llamacpp.md @@ -51,7 +51,7 @@ Retrieve a GGUF model and store it in a specific directory, for example: ```bash mkdir -p ~/models cd ~/models -curl -O "https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_0.gguf?download=true" +curl -LOJ "https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_0.gguf?download=true" ``` ## Run Docker image
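Closing note on the final patch: `curl -LOJ` is used so that `-L` follows the redirect returned by the Hugging Face `resolve` URL and `-OJ` saves the file under the name announced by the server rather than a local name carrying the `?download=true` query string. As a quick way to confirm that a router started as in these docs is serving requests, a std-only probe might look like the sketch below (assumptions: the server listens on 127.0.0.1:3000 as in the examples and exposes TGI's usual `GET /health` route):

```rust
use std::io::{Read, Write};
use std::net::TcpStream;

fn main() -> std::io::Result<()> {
    // Connect to the router started with `docker run -p 3000:3000 ...`.
    let mut stream = TcpStream::connect("127.0.0.1:3000")?;
    // Plain HTTP/1.1 request; `Connection: close` lets read_to_string return.
    stream.write_all(b"GET /health HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n")?;
    let mut response = String::new();
    stream.read_to_string(&mut response)?;
    // The first line should be the status, e.g. "HTTP/1.1 200 OK".
    println!("{}", response.lines().next().unwrap_or("no response"));
    Ok(())
}
```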