From 1c19b0934e281f6a09d5adf393f92525b4070918 Mon Sep 17 00:00:00 2001 From: OlivierDehaene Date: Fri, 3 Mar 2023 18:42:20 +0100 Subject: [PATCH] v0.3.2 (#97) --- Cargo.lock | 8 ++++---- Makefile | 2 +- README.md | 31 +++++++++++++++++++++++++++++-- docs/openapi.json | 12 +++++++++++- launcher/Cargo.toml | 2 +- router/Cargo.toml | 2 +- router/client/Cargo.toml | 2 +- router/grpc-metadata/Cargo.toml | 2 +- server/Makefile | 2 +- server/pyproject.toml | 2 +- 10 files changed, 51 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aac1114e..0a358b2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -786,7 +786,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "grpc-metadata" -version = "0.3.1" +version = "0.3.2" dependencies = [ "opentelemetry", "tonic", @@ -2212,7 +2212,7 @@ dependencies = [ [[package]] name = "text-generation-client" -version = "0.3.1" +version = "0.3.2" dependencies = [ "futures", "grpc-metadata", @@ -2229,7 +2229,7 @@ dependencies = [ [[package]] name = "text-generation-launcher" -version = "0.3.1" +version = "0.3.2" dependencies = [ "clap 4.1.8", "ctrlc", @@ -2244,7 +2244,7 @@ dependencies = [ [[package]] name = "text-generation-router" -version = "0.3.1" +version = "0.3.2" dependencies = [ "async-stream", "axum", diff --git a/Makefile b/Makefile index cf7b8675..ade0cdd5 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ run-bloom-560m-quantize: text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize download-bloom: - text-generation-server download-weights bigscience/bloom + HF_HUB_ENABLE_HF_TRANSFER=1 text-generation-server download-weights bigscience/bloom run-bloom: text-generation-launcher --model-id bigscience/bloom --num-shard 8 diff --git a/README.md b/README.md index 5234844b..ae185506 100644 --- a/README.md +++ b/README.md @@ -39,15 +39,17 @@ to power LLMs api-inference widgets. ## Features +- Serve the most popular Large Language Models with a simple launcher +- Tensor Parallelism for faster inference on multiple GPUs - Token streaming using Server-Sent Events (SSE) - [Dynamic batching of incoming requests](https://github.com/huggingface/text-generation-inference/blob/main/router/src/batcher.rs#L88) for increased total throughput - Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) - [Safetensors](https://github.com/huggingface/safetensors) weight loading -- 45ms per token generation for BLOOM with 8xA100 80GB +- Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226) - Logits warpers (temperature scaling, topk, repetition penalty ...) - Stop sequences - Log probabilities -- Distributed tracing with Open Telemetry +- Production ready (distributed tracing with Open Telemetry, Prometheus metrics) ## Officially supported architectures @@ -58,6 +60,7 @@ to power LLMs api-inference widgets. 
 - [SantaCoder](https://huggingface.co/bigcode/santacoder)
 - [GPT-Neox 20B](https://huggingface.co/EleutherAI/gpt-neox-20b)
 - [FLAN-T5-XXL](https://huggingface.co/google/flan-t5-xxl)
+- [FLAN-UL2](https://huggingface.co/google/flan-ul2)
 
 Other architectures are supported on a best effort basis using:
 
@@ -97,6 +100,32 @@ curl 127.0.0.1:8080/generate_stream \
     -H 'Content-Type: application/json'
 ```
 
+or from Python:
+
+```python
+import requests
+
+result = requests.post("http://127.0.0.1:8080/generate", json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
+print(result.json())
+```
+
+To stream tokens from the `/generate_stream` endpoint in Python, first install the `sseclient-py` package:
+
+```shell
+pip install sseclient-py
+```
+
+```python
+import sseclient
+import requests
+
+r = requests.post("http://127.0.0.1:8080/generate_stream", stream=True, json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
+sse_client = sseclient.SSEClient(r)
+
+for i, event in enumerate(sse_client.events()):
+    print(i, event.data)
+```
+
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
 
 ### API documentation
diff --git a/docs/openapi.json b/docs/openapi.json
index 969a2575..7ece1722 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -11,7 +11,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "0.3.1"
+    "version": "0.3.2"
   },
   "paths": {
     "/generate": {
@@ -290,6 +290,11 @@
           "nullable": true,
           "exclusiveMinimum": 0.0
         },
+        "return_full_text": {
+          "type": "boolean",
+          "default": "None",
+          "example": false
+        },
         "seed": {
           "type": "integer",
           "format": "int64"
@@ -328,6 +333,11 @@
           "nullable": true,
           "maximum": 1.0,
           "exclusiveMinimum": 0.0
+        },
+        "watermark": {
+          "type": "boolean",
+          "default": "false",
+          "example": true
         }
       }
     },
diff --git a/launcher/Cargo.toml b/launcher/Cargo.toml
index d53a2c4a..4779aa91 100644
--- a/launcher/Cargo.toml
+++ b/launcher/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-generation-launcher"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 description = "Text Generation Launcher"
diff --git a/router/Cargo.toml b/router/Cargo.toml
index 6e673af0..2da8a9a0 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-generation-router"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 description = "Text Generation Webserver"
diff --git a/router/client/Cargo.toml b/router/client/Cargo.toml
index b1700328..960e509f 100644
--- a/router/client/Cargo.toml
+++ b/router/client/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-generation-client"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 
 [dependencies]
diff --git a/router/grpc-metadata/Cargo.toml b/router/grpc-metadata/Cargo.toml
index 246023b5..e779fbbf 100644
--- a/router/grpc-metadata/Cargo.toml
+++ b/router/grpc-metadata/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "grpc-metadata"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 
 [dependencies]
diff --git a/server/Makefile b/server/Makefile
index 4a77dbcf..3926cf5d 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -1,4 +1,4 @@
-transformers_commit := 712d62e83c28236c7f39af690e7792a54288dbd9
+transformers_commit := 2f87dca1ca3e5663d0637da9bb037a6956e57a5e
 
 gen-server:
 	# Compile protos
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 68a40dcc..21277939 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation"
-version = "0.3.1"
+version = "0.3.2" description = "Text Generation Inference Python gRPC Server" authors = ["Olivier Dehaene "]