diff --git a/Cargo.lock b/Cargo.lock index 2df5d320..8078d24f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,9 +96,9 @@ checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" [[package]] name = "arc-swap" -version = "1.7.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b3d0060af21e8d11a926981cc00c6c1541aa91dd64b9f881985c3da1094425f" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" [[package]] name = "async-rustls" @@ -130,25 +130,25 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] name = "async-trait" -version = "0.1.78" +version = "0.1.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "461abc97219de0eaaf81fe3ef974a540158f3d079c2ab200f891f1a2ef201e85" +checksum = "a507401cad91ec6a857ed5513a2073c82a9b9048762b885bb98655b306964681" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] name = "autocfg" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" [[package]] name = "average" @@ -242,9 +242,9 @@ dependencies = [ [[package]] name = "backtrace" -version = "0.3.70" +version = "0.3.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95d8e92cac0961e91dbd517496b00f7e9b92363dbe6d42c3198268323798860c" +checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d" dependencies = [ "addr2line", "cc", @@ -323,9 +323,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "camino" @@ -385,9 +385,9 @@ checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" [[package]] name = "clap" -version = "4.5.3" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "949626d00e063efc93b6dca932419ceb5432f99769911c0b995f7e884c778813" +checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" dependencies = [ "clap_builder", "clap_derive", @@ -407,14 +407,14 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.3" +version = "4.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90239a040c80f5e14809ca132ddc4176ab33d5e17e49691793296e3fcb34d72f" +checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -748,9 +748,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.1" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" [[package]] name = "fixedbitset" @@ -876,7 +876,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + 
"syn 2.0.55", ] [[package]] @@ -969,7 +969,7 @@ dependencies = [ "futures-sink", "futures-util", "http", - "indexmap 2.2.5", + "indexmap 2.2.6", "slab", "tokio", "tokio-util", @@ -1173,9 +1173,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.5" +version = "2.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" dependencies = [ "equivalent", "hashbrown 0.14.3", @@ -1197,9 +1197,9 @@ dependencies = [ [[package]] name = "indoc" -version = "2.0.4" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" [[package]] name = "init-tracing-opentelemetry" @@ -1267,9 +1267,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "js-sys" @@ -1409,9 +1409,9 @@ checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" [[package]] name = "metrics" @@ -1450,7 +1450,7 @@ checksum = "38b4faf00617defe497754acde3024865bc143d44a86799b24e191ecff91354f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -1486,8 +1486,8 @@ dependencies = [ [[package]] name = "minijinja" -version = "1.0.15" -source = "git+https://github.com/mitsuhiko/minijinja.git?branch=main#045d6a0f7ee1ea9cb4aa7725bc8fb1c39ab1d90b" +version = "1.0.16" +source = "git+https://github.com/mitsuhiko/minijinja.git?branch=main#82d0160b5513844e5429db084f1cbdd3313ed482" dependencies = [ "serde", ] @@ -1537,7 +1537,7 @@ checksum = "f686d68a09079e63b1d2c64aa305095887ce50565f00a922ebfaeeee0d9ba6ce" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -1834,7 +1834,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -1845,9 +1845,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.101" +version = "0.9.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" +checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" dependencies = [ "cc", "libc", @@ -2030,7 +2030,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" dependencies = [ "fixedbitset", - "indexmap 2.2.5", + "indexmap 2.2.6", ] [[package]] @@ -2050,7 +2050,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -2091,12 +2091,12 @@ checksum = 
"5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "prettyplease" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" +checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -2169,7 +2169,7 @@ dependencies = [ "prost 0.12.3", "prost-types", "regex", - "syn 2.0.53", + "syn 2.0.55", "tempfile", "which", ] @@ -2197,7 +2197,7 @@ dependencies = [ "itertools 0.11.0", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -2292,9 +2292,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.9.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", @@ -2343,14 +2343,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.3" +version = "1.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" dependencies = [ "aho-corasick", "memchr", "regex-automata 0.4.6", - "regex-syntax 0.8.2", + "regex-syntax 0.8.3", ] [[package]] @@ -2370,7 +2370,7 @@ checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.8.2", + "regex-syntax 0.8.3", ] [[package]] @@ -2387,9 +2387,9 @@ checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" [[package]] name = "reqwest" @@ -2482,7 +2482,7 @@ dependencies = [ "quote", "rust-embed-utils", "shellexpand", - "syn 2.0.53", + "syn 2.0.55", "walkdir", ] @@ -2538,9 +2538,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.22.2" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41" +checksum = "99008d7ad0bbbea527ec27bddbc0e432c5b87d8175178cee68d2eec9c4a1813c" dependencies = [ "log", "ring 0.17.8", @@ -2561,9 +2561,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.3.1" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" +checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" [[package]] name = "rustls-webpki" @@ -2671,14 +2671,14 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] name = "serde_json" -version = "1.0.114" +version = "1.0.115" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" +checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" dependencies = [ "itoa", "ryu", @@ -2861,7 
+2861,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -2883,9 +2883,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.53" +version = "2.0.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7383cd0e49fff4b6b90ca5670bfd3e9d6a733b3f90c686605aa7eec8c4996032" +checksum = "002a1b3dbf967edfafc32655d0f377ab0bb7b994aa1d32c8cc7e9b8bf3ebb8f0" dependencies = [ "proc-macro2", "quote", @@ -2971,7 +2971,7 @@ dependencies = [ [[package]] name = "text-generation-benchmark" -version = "1.4.4" +version = "1.4.5" dependencies = [ "average", "clap", @@ -2992,7 +2992,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "1.4.4" +version = "1.4.5" dependencies = [ "futures", "grpc-metadata", @@ -3008,7 +3008,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "1.4.4" +version = "1.4.5" dependencies = [ "clap", "ctrlc", @@ -3024,7 +3024,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "1.4.4" +version = "1.4.5" dependencies = [ "async-stream", "axum", @@ -3079,7 +3079,7 @@ checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -3198,7 +3198,7 @@ dependencies = [ "rayon", "rayon-cond", "regex", - "regex-syntax 0.8.2", + "regex-syntax 0.8.3", "serde", "serde_json", "spm_precompiled", @@ -3210,9 +3210,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", @@ -3245,7 +3245,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -3360,7 +3360,7 @@ dependencies = [ "proc-macro2", "prost-build", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -3433,7 +3433,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -3620,7 +3620,7 @@ dependencies = [ "log", "native-tls", "once_cell", - "rustls 0.22.2", + "rustls 0.22.3", "rustls-pki-types", "rustls-webpki", "serde", @@ -3658,7 +3658,7 @@ version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d82b1bc5417102a73e8464c686eef947bdfb99fcdfc0a4f228e81afa9526470a" dependencies = [ - "indexmap 2.2.5", + "indexmap 2.2.6", "serde", "serde_json", "utoipa-gen", @@ -3674,7 +3674,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] @@ -3779,7 +3779,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", "wasm-bindgen-shared", ] @@ -3813,7 +3813,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4140,7 +4140,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.53", + "syn 2.0.55", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index d76cbc68..77a30f55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ 
members = [ resolver = "2" [workspace.package] -version = "1.4.4" +version = "1.4.5" edition = "2021" authors = ["Olivier Dehaene"] homepage = "https://github.com/huggingface/text-generation-inference" diff --git a/benchmark/src/app.rs b/benchmark/src/app.rs index b27c56b4..48ac976a 100644 --- a/benchmark/src/app.rs +++ b/benchmark/src/app.rs @@ -444,7 +444,7 @@ fn progress_gauge(title: &str, label: String, progress: f64, color: Color) -> Ga } /// Throughput paragraph -fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragraph<'a> { +fn throughput_paragraph<'a>(throughput: &[f64], name: &'static str) -> Paragraph<'a> { // Throughput average/high/low texts let throughput_texts = statis_spans(throughput, "tokens/secs"); @@ -457,7 +457,7 @@ fn throughput_paragraph<'a>(throughput: &Vec<f64>, name: &'static str) -> Paragr } /// Latency paragraph -fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragraph<'a> { +fn latency_paragraph<'a>(latency: &mut [f64], name: &'static str) -> Paragraph<'a> { // Latency average/high/low texts let mut latency_texts = statis_spans(latency, "ms"); @@ -483,7 +483,7 @@ fn latency_paragraph<'a>(latency: &mut Vec<f64>, name: &'static str) -> Paragrap } /// Average/High/Low spans -fn statis_spans<'a>(data: &Vec<f64>, unit: &'static str) -> Vec<Line<'a>> { +fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> { vec![ Line::from(vec![Span::styled( format!( @@ -543,7 +543,7 @@ fn latency_histogram<'a>( /// Latency/Throughput chart fn latency_throughput_chart<'a>( - latency_throughput: &'a Vec<(f64, f64)>, + latency_throughput: &'a [(f64, f64)], batch_sizes: &'a [u32], zoom: bool, name: &'static str, diff --git a/benchmark/src/table.rs b/benchmark/src/table.rs index c4819ff3..e18d7310 100644 --- a/benchmark/src/table.rs +++ b/benchmark/src/table.rs @@ -151,7 +151,7 @@ fn add_throuhgputs( } } -fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) { +fn avg_min_max(data: &[f64]) -> (f64, f64, f64) { let average = data.iter().sum::<f64>() / data.len() as f64; let min = data .iter() @@ -164,7 +164,7 @@ fn avg_min_max(data: &Vec<f64>) -> (f64, f64, f64) { (average, *min, *max) } -fn px(data: &Vec<f64>, p: u32) -> f64 { +fn px(data: &[f64], p: u32) -> f64 { let i = (f64::from(p) / 100.0 * data.len() as f64) as usize; *data.get(i).unwrap_or(&std::f64::NAN) } diff --git a/docs/openapi.json b/docs/openapi.json index 75965d98..fdf1c804 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -10,7 +10,7 @@ "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, - "version": "1.4.4" + "version": "1.4.5" }, "paths": { "/": { @@ -471,6 +471,90 @@ } } } + }, + "/v1/completions": { + "post": { + "tags": [ + "Text Generation Inference" + ], + "summary": "Generate tokens", + "description": "Generate tokens", + "operationId": "completions", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CompletionRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Generated Text", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ChatCompletionChunk" + } + } + } + }, + "422": { + "description": "Input validation error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Input validation error" + } + } + } + }, + "424": { + "description": "Generation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": 
{ + "error": "Request failed during generation" + } + } + } + }, + "429": { + "description": "Model is overloaded", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Model is overloaded" + } + } + } + }, + "500": { + "description": "Incomplete generation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Incomplete generation" + } + } + } + } + } + } } }, "components": { @@ -669,17 +753,25 @@ "ChatCompletionDelta": { "type": "object", "required": [ - "role", - "content" + "role" ], "properties": { "content": { "type": "string", - "example": "What is Deep Learning?" + "example": "What is Deep Learning?", + "nullable": true }, "role": { "type": "string", "example": "user" + }, + "tool_calls": { + "allOf": [ + { + "$ref": "#/components/schemas/DeltaToolCall" + } + ], + "nullable": true } } }, @@ -739,7 +831,8 @@ "ChatRequest": { "type": "object", "required": [ - "model" + "model", + "messages" ], "properties": { "frequency_penalty": { @@ -777,11 +870,12 @@ "items": { "$ref": "#/components/schemas/Message" }, - "description": "A list of messages comprising the conversation so far." + "description": "A list of messages comprising the conversation so far.", + "example": "[{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}]" }, "model": { "type": "string", - "description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.", + "description": "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.", "example": "mistralai/Mistral-7B-Instruct-v0.2" }, "n": { @@ -806,6 +900,15 @@ "nullable": true, "minimum": 0 }, + "stop": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Up to 4 sequences where the API will stop generating further tokens.", + "example": "null", + "nullable": true + }, "stream": { "type": "boolean" }, @@ -816,6 +919,29 @@ "example": 1.0, "nullable": true }, + "tool_choice": { + "allOf": [ + { + "$ref": "#/components/schemas/ToolType" + } + ], + "nullable": true + }, + "tool_prompt": { + "type": "string", + "description": "A prompt to be appended before the tools", + "example": "\"Based on the conversation, please choose the most appropriate tool to use: \"", + "nullable": true + }, + "tools": { + "type": "array", + "items": { + "$ref": "#/components/schemas/Tool" + }, + "description": "A list of tools the model may call. Currently, only functions are supported as a tool. 
Use this to provide a list of\nfunctions the model may generate JSON inputs for.", + "example": "null", + "nullable": true + }, "top_logprobs": { "type": "integer", "format": "int32", @@ -852,6 +978,164 @@ } } }, + "CompletionComplete": { + "type": "object", + "required": [ + "index", + "text", + "finish_reason" + ], + "properties": { + "finish_reason": { + "type": "string" + }, + "index": { + "type": "integer", + "format": "int32", + "minimum": 0 + }, + "logprobs": { + "type": "array", + "items": { + "type": "number", + "format": "float" + }, + "nullable": true + }, + "text": { + "type": "string" + } + } + }, + "CompletionCompleteChunk": { + "type": "object", + "required": [ + "id", + "object", + "created", + "choices", + "model", + "system_fingerprint" + ], + "properties": { + "choices": { + "type": "array", + "items": { + "$ref": "#/components/schemas/CompletionComplete" + } + }, + "created": { + "type": "integer", + "format": "int64", + "minimum": 0 + }, + "id": { + "type": "string" + }, + "model": { + "type": "string" + }, + "object": { + "type": "string" + }, + "system_fingerprint": { + "type": "string" + } + } + }, + "CompletionRequest": { + "type": "object", + "required": [ + "model", + "prompt" + ], + "properties": { + "frequency_penalty": { + "type": "number", + "format": "float", + "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.", + "example": "1.0", + "nullable": true + }, + "max_tokens": { + "type": "integer", + "format": "int32", + "description": "The maximum number of tokens that can be generated in the chat completion.", + "default": "32", + "nullable": true, + "minimum": 0 + }, + "model": { + "type": "string", + "description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.", + "example": "mistralai/Mistral-7B-Instruct-v0.2" + }, + "prompt": { + "type": "string", + "description": "The prompt to generate completions for.", + "example": "What is Deep Learning?" + }, + "repetition_penalty": { + "type": "number", + "format": "float", + "nullable": true + }, + "seed": { + "type": "integer", + "format": "int64", + "example": 42, + "nullable": true, + "minimum": 0 + }, + "stream": { + "type": "boolean" + }, + "suffix": { + "type": "string", + "description": "The text to append to the prompt. This is useful for completing sentences or generating a paragraph of text.\nplease see the completion_template field in the model's tokenizer_config.json file for completion template.", + "nullable": true + }, + "temperature": { + "type": "number", + "format": "float", + "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both.", + "example": 1.0, + "nullable": true + }, + "top_p": { + "type": "number", + "format": "float", + "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. 
So 0.1 means only the tokens comprising the top 10% probability mass are considered.", + "example": 0.95, + "nullable": true + } + } + }, + "DeltaToolCall": { + "type": "object", + "required": [ + "index", + "id", + "type", + "function" + ], + "properties": { + "function": { + "$ref": "#/components/schemas/Function" + }, + "id": { + "type": "string" + }, + "index": { + "type": "integer", + "format": "int32", + "minimum": 0 + }, + "type": { + "type": "string" + } + } + }, "Details": { "type": "object", "required": [ @@ -931,6 +1215,38 @@ ], "example": "Length" }, + "Function": { + "type": "object", + "required": [ + "arguments" + ], + "properties": { + "arguments": { + "type": "string" + }, + "name": { + "type": "string", + "nullable": true + } + } + }, + "FunctionDefinition": { + "type": "object", + "required": [ + "name", + "parameters" + ], + "properties": { + "description": { + "type": "string", + "nullable": true + }, + "name": { + "type": "string" + }, + "parameters": {} + } + }, "GenerateParameters": { "type": "object", "properties": { @@ -1261,13 +1577,13 @@ "Message": { "type": "object", "required": [ - "role", - "content" + "role" ], "properties": { "content": { "type": "string", - "example": "My name is David and I" + "example": "My name is David and I", + "nullable": true }, "name": { "type": "string", @@ -1277,6 +1593,13 @@ "role": { "type": "string", "example": "user" + }, + "tool_calls": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ToolCall" + }, + "nullable": true } } }, @@ -1437,6 +1760,64 @@ "$ref": "#/components/schemas/SimpleToken" } }, + "Tool": { + "type": "object", + "required": [ + "type", + "function" + ], + "properties": { + "function": { + "$ref": "#/components/schemas/FunctionDefinition" + }, + "type": { + "type": "string", + "example": "function" + } + } + }, + "ToolCall": { + "type": "object", + "required": [ + "id", + "type", + "function" + ], + "properties": { + "function": { + "$ref": "#/components/schemas/FunctionDefinition" + }, + "id": { + "type": "integer", + "format": "int32", + "minimum": 0 + }, + "type": { + "type": "string" + } + } + }, + "ToolType": { + "oneOf": [ + { + "type": "object", + "required": [ + "FunctionName" + ], + "properties": { + "FunctionName": { + "type": "string" + } + } + }, + { + "type": "string", + "enum": [ + "OneOf" + ] + } + ] + }, "Usage": { "type": "object", "required": [ diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json index 467b8ce3..543be115 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_no_tools.json @@ -17,7 +17,7 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.4-native", + "system_fingerprint": "1.4.5-native", "usage": { "completion_tokens": 100, "prompt_tokens": 60, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json index 8bdb7465..728e90a4 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools.json @@ -31,7 +31,7 @@ "id": "", "model": 
"TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.4-native", + "system_fingerprint": "1.4.5-native", "usage": { "completion_tokens": 29, "prompt_tokens": 316, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json index 5ba297b1..2e0efb86 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_auto.json @@ -31,7 +31,7 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.4-native", + "system_fingerprint": "1.4.5-native", "usage": { "completion_tokens": 29, "prompt_tokens": 316, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json index 522624bc..91854223 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_choice.json @@ -30,7 +30,7 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.4-native", + "system_fingerprint": "1.4.5-native", "usage": { "completion_tokens": 21, "prompt_tokens": 187, diff --git a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json index c085100d..e0c7aed6 100644 --- a/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json +++ b/integration-tests/models/__snapshots__/test_tools_llama/test_flash_llama_grammar_tools_stream.json @@ -23,5 +23,5 @@ "id": "", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "object": "text_completion", - "system_fingerprint": "1.4.4-native" + "system_fingerprint": "1.4.5-native" } diff --git a/integration-tests/pyproject.toml b/integration-tests/pyproject.toml index cab74c46..ad217072 100644 --- a/integration-tests/pyproject.toml +++ b/integration-tests/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-integration-tests" -version = "1.4.4" +version = "1.4.5" description = "Text Generation Inference integration tests" authors = ["Nicolas Patry "] diff --git a/launcher/src/main.rs b/launcher/src/main.rs index d52e2669..990eade4 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -604,7 +604,7 @@ fn shard_manager( // We read stderr in another thread as it seems that lines() can block in some cases let (err_sender, err_receiver) = mpsc::channel(); thread::spawn(move || { - for line in shard_stderr_reader.lines().flatten() { + for line in shard_stderr_reader.lines().map_while(Result::ok) { err_sender.send(line).unwrap_or(()); } }); @@ -722,7 +722,7 @@ impl TryFrom<&String> for PythonLogMessage { } fn log_lines(lines: Lines) { - for line in lines.flatten() { + for line in lines.map_while(Result::ok) { match PythonLogMessage::try_from(&line) { Ok(log) => log.trace(), Err(_) => tracing::debug!("{line}"), @@ -874,7 +874,7 @@ fn download_convert_model(args: &Args, running: Arc) -> Result<(), L // We read stderr in another thread as it seems that lines() can 
block in some cases let (err_sender, err_receiver) = mpsc::channel(); thread::spawn(move || { - for line in download_stderr.lines().flatten() { + for line in download_stderr.lines().map_while(Result::ok) { err_sender.send(line).unwrap_or(()); } }); diff --git a/router/src/server.rs b/router/src/server.rs index b328a8e8..eaed3f1c 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -12,7 +12,7 @@ use crate::{ ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionComplete, ChatCompletionDelta, ChatCompletionLogprob, ChatCompletionLogprobs, ChatCompletionTopLogprob, ChatRequest, CompatGenerateRequest, Completion, CompletionComplete, CompletionCompleteChunk, - CompletionRequest, VertexRequest, VertexResponse, + CompletionRequest, DeltaToolCall, Function, Tool, VertexRequest, VertexResponse, }; use crate::{FunctionDefinition, FunctionRef, FunctionsMap, Properties, ToolCall, ToolType, Tools}; use axum::extract::Extension; @@ -1211,6 +1211,12 @@ pub async fn run( ErrorResponse, GrammarType, Usage, + DeltaToolCall, + ToolType, + Tool, + ToolCall, + Function, + FunctionDefinition, ) ), tags( diff --git a/server/pyproject.toml b/server/pyproject.toml index 241c632d..b5e9c3fd 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "text-generation-server" -version = "1.4.4" +version = "1.4.5" description = "Text Generation Inference Python gRPC Server" authors = ["Olivier Dehaene "]