v0.8.0

2023-05-30 18:39:35 +02:00 · 2023-05-30 18:39:35 +02:00 · 081b926584
parent b8b950b37c
commit 081b926584
6 changed files with 12 additions and 18 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2557,7 +2557,7 @@ dependencies = [

 [[package]]
 name = "text-generation-benchmark"
-version = "0.7.0"
+version = "0.8.0"
 dependencies = [
 "average",
 "clap",
@@ -2577,7 +2577,7 @@ dependencies = [

 [[package]]
 name = "text-generation-client"
-version = "0.7.0"
+version = "0.8.0"
 dependencies = [
 "futures",
 "grpc-metadata",
@@ -2593,7 +2593,7 @@ dependencies = [

 [[package]]
 name = "text-generation-launcher"
-version = "0.7.0"
+version = "0.8.0"
 dependencies = [
 "clap",
 "ctrlc",
@@ -2609,7 +2609,7 @@ dependencies = [

 [[package]]
 name = "text-generation-router"
-version = "0.7.0"
+version = "0.8.0"
 dependencies = [
 "async-stream",
 "axum",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,7 +8,7 @@ members = [
 ]

 [workspace.package]
-version = "0.7.0"
+version = "0.8.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 homepage = "https://github.com/huggingface/text-generation-inference"
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ to power LLMs api-inference widgets.
 - Serve the most popular Large Language Models with a simple launcher
 - Tensor Parallelism for faster inference on multiple GPUs
 - Token streaming using Server-Sent Events (SSE)
-- [Continous batching of incoming requests](https://github.com/huggingface/text-generation-inference/tree/main/router) for increased total throughput
+- [Continuous batching of incoming requests](https://github.com/huggingface/text-generation-inference/tree/main/router) for increased total throughput
 - Optimized transformers code for inference using [flash-attention](https://github.com/HazyResearch/flash-attention) on the most popular architectures
 - Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
 - [Safetensors](https://github.com/huggingface/safetensors) weight loading
@@ -61,6 +61,9 @@ to power LLMs api-inference widgets.
 - [Llama](https://github.com/facebookresearch/llama)
 - [OPT](https://huggingface.co/facebook/opt-66b)
 - [SantaCoder](https://huggingface.co/bigcode/santacoder)
+- [Starcoder](https://huggingface.co/bigcode/starcoder)
+- [Falcon 7B](https://huggingface.co/tiiuae/falcon-7b)
+- [Falcon 40B](https://huggingface.co/tiiuae/falcon-40b)

 Other architectures are supported on a best effort basis using:

@@ -81,7 +84,7 @@ model=bigscience/bloom-560m
 num_shard=2
 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:0.7 --model-id $model --num-shard $num_shard
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:0.8 --model-id $model --num-shard $num_shard
 ```
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). We also recommend using NVIDIA drivers with CUDA version 11.8 or higher.

--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -10,7 +10,7 @@
      "name": "Apache 2.0",
      "url": "https://www.apache.org/licenses/LICENSE-2.0"
    },
-    "version": "0.7.0"
+    "version": "0.8.0"
  },
  "paths": {
    "/": {
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "0.7.0"
+version = "0.8.0"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]

--- a/supported_models.json
+++ b/supported_models.json
@@ -1,9 +0,0 @@
-[
-  "bigcode/santacoder",
-  "bigscience/bloom",
-  "bigscience/bloomz",
-  "EleutherAI/gpt-neox-20b",
-  "google/flan-ul2",
-  "google/flan-t5-xxl",
-  "OpenAssistant/oasst-sft-1-pythia-12b"
-]