From c38a7d7ddd9c612e368adec1ef94583be602fc7e Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Fri, 12 Apr 2024 18:38:34 +0200 Subject: [PATCH] v2.0.0 (#1736) --- Cargo.lock | 213 ++++++++---------- Cargo.toml | 2 +- README.md | 6 +- docs/openapi.json | 2 +- .../test_flash_llama_grammar_no_tools.json | 2 +- .../test_flash_llama_grammar_tools.json | 2 +- .../test_flash_llama_grammar_tools_auto.json | 2 +- ...test_flash_llama_grammar_tools_choice.json | 2 +- ...test_flash_llama_grammar_tools_stream.json | 2 +- integration-tests/pyproject.toml | 2 +- server/pyproject.toml | 2 +- server/text_generation_server/interceptor.py | 4 + .../models/cache_manager.py | 7 +- server/text_generation_server/utils/layers.py | 9 + 14 files changed, 124 insertions(+), 133 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8c1784bf..fd6baab5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,9 +96,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.81" +version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" +checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519" [[package]] name = "arbitrary" @@ -120,7 +120,7 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -159,18 +159,18 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] name = "async-trait" -version = "0.1.79" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507401cad91ec6a857ed5513a2073c82a9b9048762b885bb98655b306964681" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -375,15 +375,15 @@ dependencies = [ [[package]] name = "built" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d17f4d6e4dc36d1a02fbedc2753a096848e7c1b0772f7654eab8e2c927dd53" +checksum = "41bfbdb21256b87a8b5e80fab81a8eed158178e812fd7ba451907518b2742f16" [[package]] name = "bumpalo" -version = "3.15.4" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytecount" @@ -449,9 +449,9 @@ checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" [[package]] name = "cc" -version = "1.0.90" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" +checksum = "2678b2e3449475e95b0aa6f9b506a28e61b3dc8996592b983695e8ebb58a8b41" dependencies = [ "jobserver", "libc", @@ -459,9 +459,9 @@ dependencies = [ [[package]] name = "cfg-expr" -version = "0.15.7" +version = "0.15.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa50868b64a9a6fda9d593ce778849ea8715cd2a3d2cc17ffdb4a2f2f2f1961d" +checksum = "d067ad48b8650848b989a59a86c6c36a995d02d2bf778d45c3c5d57bc2718f02" dependencies = [ "smallvec", "target-lexicon", @@ -498,7 +498,7 @@ dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim 0.11.0", + "strsim 0.11.1", ] [[package]] @@ -510,7 +510,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -812,9 +812,9 @@ checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" [[package]] name = "encoding_rs" -version = "0.8.33" +version = "0.8.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" dependencies = [ "cfg-if", ] @@ -1018,7 +1018,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -1072,9 +1072,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" dependencies = [ "cfg-if", "js-sys", @@ -1111,9 +1111,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.25" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fbd2820c5e49886948654ab546d0688ff24530286bdcf8fca3cefb16d4618eb" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ "bytes", "fnv", @@ -1198,15 +1198,6 @@ dependencies = [ "ureq", ] -[[package]] -name = "home" -version = "0.5.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" -dependencies = [ - "windows-sys 0.52.0", -] - [[package]] name = "hostname" version = "0.3.1" @@ -1432,7 +1423,7 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -1485,9 +1476,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.28" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" +checksum = "f08474e32172238f2827bd160c67871cdb2801430f65c3979184dc362e3ca118" dependencies = [ "libc", ] @@ -1574,13 +1565,12 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "libredox" -version = "0.0.1" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85c833ca1e66078851dba29046874e38f08b2c883700aa29a03ddd3b23814ee8" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" dependencies = [ "bitflags 2.5.0", "libc", - "redox_syscall", ] [[package]] @@ -1713,7 +1703,7 @@ checksum = "38b4faf00617defe497754acde3024865bc143d44a86799b24e191ecff91354f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -1801,14 +1791,14 @@ checksum = "f686d68a09079e63b1d2c64aa305095887ce50565f00a922ebfaeeee0d9ba6ce" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] name = "multimap" -version = "0.8.3" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" [[package]] name = "muxado" @@ -1991,7 +1981,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -2121,7 +2111,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -2337,14 +2327,14 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -2396,7 +2386,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -2448,7 +2438,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8021cf59c8ec9c432cfc2526ac6b8aa508ecaf29cd415f271b8406c1b851c3fd" dependencies = [ "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -2463,34 +2453,33 @@ dependencies = [ [[package]] name = "prost" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "146c289cda302b98a28d40c8b3b90498d6e526dd24ac2ecea73e4e491685b94a" +checksum = "d0f5d036824e4761737860779c906171497f6d55681139d8312388f8fe398922" dependencies = [ "bytes", - "prost-derive 0.12.3", + "prost-derive 0.12.4", ] [[package]] name = "prost-build" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c55e02e35260070b6f716a2423c2ff1c3bb1642ddca6f99e1f26d06268a0e2d2" +checksum = "80b776a1b2dc779f5ee0641f8ade0125bc1298dd41a9a0c16d8bd57b42d222b1" dependencies = [ "bytes", - "heck 0.4.1", - "itertools 0.11.0", + "heck 0.5.0", + "itertools 0.12.1", "log", "multimap", "once_cell", "petgraph", "prettyplease", - "prost 0.12.3", + "prost 0.12.4", "prost-types", "regex", - "syn 2.0.55", + "syn 2.0.58", "tempfile", - "which", ] [[package]] @@ -2508,24 +2497,24 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efb6c9a1dd1def8e2124d17e83a20af56f1570d6c2d2bd9e266ccb768df3840e" +checksum = "19de2de2a00075bf566bee3bd4db014b11587e84184d3f7a791bc17f1a8e9e48" dependencies = [ "anyhow", - "itertools 0.11.0", + "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] name = "prost-types" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "193898f59edcf43c26227dcd4c8427f00d99d61e95dcde58dabd49fa291d470e" +checksum = "3235c33eb02c1f1e212abdbe34c78b264b038fb58ca612664343271e36e55ffe" dependencies = [ - "prost 0.12.3", + "prost 0.12.4", ] [[package]] @@ -2561,9 +2550,9 @@ checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -2716,9 +2705,9 @@ dependencies = [ [[package]] name = "redox_users" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a18479200779601e498ada4e8c1e1f50e3ee19deb0259c25825a98b5603b2cb4" +checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" dependencies = [ "getrandom", "libredox", @@ -2875,7 +2864,7 @@ dependencies = [ "quote", "rust-embed-utils", "shellexpand", - "syn 2.0.55", + "syn 2.0.58", "walkdir", ] @@ -2971,9 +2960,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" +checksum = "80af6f9131f277a45a3fba6ce8e2258037bb0477a67e610d3c1fe046ab31de47" [[package]] name = "ryu" @@ -3017,9 +3006,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.9.2" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +checksum = "770452e37cad93e0a50d5abc3990d2bc351c36d0328f86cefec2f2fb206eaef6" dependencies = [ "bitflags 1.3.2", "core-foundation", @@ -3030,9 +3019,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.1" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +checksum = "41f3cc463c0ef97e11c3461a9d3787412d30e8e7eb907c79180c4a57bf7c04ef" dependencies = [ "core-foundation-sys", "libc", @@ -3064,7 +3053,7 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -3258,9 +3247,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "strsim" -version = "0.11.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "strum" @@ -3281,7 +3270,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -3303,9 +3292,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.55" +version = "2.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "002a1b3dbf967edfafc32655d0f377ab0bb7b994aa1d32c8cc7e9b8bf3ebb8f0" +checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" dependencies = [ "proc-macro2", "quote", @@ -3320,9 +3309,9 @@ checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" [[package]] name = "sysinfo" -version = "0.30.7" +version = "0.30.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c385888ef380a852a16209afc8cfad22795dd8873d69c9a14d2e2088f118d18" +checksum = "26d7c217777061d5a2d652aea771fb9ba98b6dade657204b08c4b9604d11555b" dependencies = [ "cfg-if", "core-foundation-sys", @@ -3410,7 +3399,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "1.4.5" +version = "2.0.0" dependencies = [ "average", "clap", @@ -3431,11 +3420,11 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "1.4.5" +version = "2.0.0" dependencies = [ "futures", "grpc-metadata", - "prost 0.12.3", + "prost 0.12.4", "prost-build", "thiserror", "tokio", @@ -3447,7 +3436,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "1.4.5" +version = "2.0.0" dependencies = [ "clap", "ctrlc", @@ -3465,7 +3454,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "1.4.5" +version = "2.0.0" dependencies = [ "async-stream", "axum", @@ -3522,7 +3511,7 @@ checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -3548,9 +3537,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.34" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ "deranged", "itoa", @@ -3571,9 +3560,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.17" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ "num-conv", "time-core", @@ -3699,7 +3688,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -3829,7 +3818,7 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost 0.12.3", + "prost 0.12.4", "tokio", "tokio-stream", "tower", @@ -3848,7 +3837,7 @@ dependencies = [ "proc-macro2", "prost-build", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -3921,7 +3910,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -4162,7 +4151,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] @@ -4284,7 +4273,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", "wasm-bindgen-shared", ] @@ -4318,7 +4307,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4364,18 +4353,6 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53a85b86a771b1c87058196170769dd264f66c0782acf1ae6cc51bfd64b39082" -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix", -] - [[package]] name = "winapi" version = "0.3.9" @@ -4626,9 +4603,9 @@ checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" [[package]] name = "winnow" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dffa400e67ed5a4dd237983829e66475f0a4a26938c4b04c21baede6262215b8" +checksum = "f0c976aaaa0e1f90dbb21e9587cdaf1d9679a1cde8875c0d6bd83ab96a208352" dependencies = [ "memchr", ] @@ -4660,7 +4637,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.55", + "syn 2.0.58", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index a9e7f1c8..ecb4878f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ members = [ resolver = "2" [workspace.package] -version = "1.4.5" +version = "2.0.0" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/README.md b/README.md index bffe1e8a..ad66e328 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ For a detailed starting guide, please see the [Quick Tour](https://huggingface.c model=HuggingFaceH4/zephyr-7b-beta volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run -docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model +docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model ``` And then you can make requests like @@ -90,7 +90,7 @@ curl 127.0.0.1:8080/generate_stream \ **Note:** To use NVIDIA GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 12.2 or higher. For running the Docker container on a machine with no GPUs or CUDA support, it is enough to remove the `--gpus all` flag and add `--disable-custom-kernels`, please note CPU is not the intended platform for this project, so performance might be subpar. -**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4-rocm --model-id $model` instead of the command above. +**Note:** TGI supports AMD Instinct MI210 and MI250 GPUs. Details can be found in the [Supported Hardware documentation](https://huggingface.co/docs/text-generation-inference/supported_models#supported-hardware). To use AMD GPUs, please use `docker run --device /dev/kfd --device /dev/dri --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0-rocm --model-id $model` instead of the command above. To see all options to serve your models (in the [code](https://github.com/huggingface/text-generation-inference/blob/main/launcher/src/main.rs) or in the cli): ``` @@ -120,7 +120,7 @@ model=meta-llama/Llama-2-7b-chat-hf volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run token= -docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:1.4 --model-id $model +docker run --gpus all --shm-size 1g -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:2.0 --model-id $model ``` ### A note on Shared Memory (shm) diff --git a/docs/openapi.json b/docs/openapi.json index fdf1c804..34b030f2 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "1.4.5" + "version": "2.0.0" }, "paths": { "/": { diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json index 543be115..153a508d 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json @@ -17,7 +17,7 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.5-native", + "system_fingerprint": "2.0.0-native", "usage": { "completion_tokens": 100, "prompt_tokens": 60, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json index 728e90a4..56920b3e 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json @@ -31,7 +31,7 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.5-native", + "system_fingerprint": "2.0.0-native", "usage": { "completion_tokens": 29, "prompt_tokens": 316, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json index 2e0efb86..fe679362 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json @@ -31,7 +31,7 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.5-native", + "system_fingerprint": "2.0.0-native", "usage": { "completion_tokens": 29, "prompt_tokens": 316, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json index 91854223..e48a1e7d 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json @@ -30,7 +30,7 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.5-native", + "system_fingerprint": "2.0.0-native", "usage": { "completion_tokens": 21, "prompt_tokens": 187, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json index e0c7aed6..cfebc05f 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json @@ -23,5 +23,5 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.5-native" + "system_fingerprint": "2.0.0-native" } diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index ad217072..6923ff23 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-integration-tests" -version = "1.4.5" +version = "2.0.0" description = "Text Generation Inference integration tests" authors = ["Nicolas Patry "] diff --git a/server/pyproject.toml b/server/pyproject.toml index 58c54b94..e7c576a8 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-server" -version = "1.4.5" +version = "2.0.0" description = "Text Generation Inference Python gRPC Server" authors = ["Olivier Dehaene "] diff --git a/server/text_generation_server/interceptor.py b/server/text_generation_server/interceptor.py index 725105f3..57df1725 100644 --- a/server/text_generation_server/interceptor.py +++ b/server/text_generation_server/interceptor.py @@ -23,6 +23,10 @@ class ExceptionInterceptor(AsyncServerInterceptor): method_name = method_name.split("/")[-1] logger.exception(f"Method {method_name} encountered an error.") + # Runtime Error cannot be recovered from + if isinstance(err, RuntimeError): + exit(1) + if torch.cuda.is_available(): torch.cuda.empty_cache() diff --git a/server/text_generation_server/models/cache_manager.py b/server/text_generation_server/models/cache_manager.py index 4be8b1b9..85e1b19b 100644 --- a/server/text_generation_server/models/cache_manager.py +++ b/server/text_generation_server/models/cache_manager.py @@ -55,9 +55,10 @@ class CacheManager: ): # Get free blocks indices by finding values in mask that are not set to 0 free_block_indices = self.free_block_mask.nonzero() - assert ( - len(free_block_indices) >= blocks - ), f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks" + if blocks > len(free_block_indices): + raise RuntimeError( + f"Out of available cache blocks: asked {blocks}, only {len(free_block_indices)} free blocks" + ) # Slice by the number of required blocks block_indices = free_block_indices[:blocks] diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py index 2c12984b..9cf5c80f 100644 --- a/server/text_generation_server/utils/layers.py +++ b/server/text_generation_server/utils/layers.py @@ -503,6 +503,10 @@ class MedusaHeadV1(nn.Module): self, input: torch.Tensor ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: logits = self.lm_head(input) + # If we have too many tokens, we skip speculative logits + if input.shape[0] > 128: + return logits, None + speculative_logits = self.medusa(input) return logits, speculative_logits @@ -549,6 +553,11 @@ class MedusaHeadV2(nn.Module): self.lm_head = TensorParallelHead.load(config, prefix, weights) def forward(self, x): + # If we have too many tokens, we skip speculative logits + if x.shape[0] > 128: + logits = self.lm_head(x) + return logits, None + size = x.shape[-1] block_size = (size + self.world_size - 1) // self.world_size start = self.rank * block_size