v0.3.2 (#97)

commit 1c19b0934e
parent 0b6807caa4

Cargo.lock
@@ -786,7 +786,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"

 [[package]]
 name = "grpc-metadata"
-version = "0.3.1"
+version = "0.3.2"
 dependencies = [
  "opentelemetry",
  "tonic",
@@ -2212,7 +2212,7 @@ dependencies = [

 [[package]]
 name = "text-generation-client"
-version = "0.3.1"
+version = "0.3.2"
 dependencies = [
  "futures",
  "grpc-metadata",
@@ -2229,7 +2229,7 @@ dependencies = [

 [[package]]
 name = "text-generation-launcher"
-version = "0.3.1"
+version = "0.3.2"
 dependencies = [
  "clap 4.1.8",
  "ctrlc",
@@ -2244,7 +2244,7 @@ dependencies = [

 [[package]]
 name = "text-generation-router"
-version = "0.3.1"
+version = "0.3.2"
 dependencies = [
  "async-stream",
  "axum",

Makefile
@@ -28,7 +28,7 @@ run-bloom-560m-quantize:
 	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize

 download-bloom:
-	text-generation-server download-weights bigscience/bloom
+	HF_HUB_ENABLE_HF_TRANSFER=1 text-generation-server download-weights bigscience/bloom

 run-bloom:
 	text-generation-launcher --model-id bigscience/bloom --num-shard 8
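
The `download-bloom` recipe now sets `HF_HUB_ENABLE_HF_TRANSFER=1`, which tells `huggingface_hub` to use the Rust-based `hf_transfer` download backend for higher throughput. A minimal sketch of the same opt-in from Python, assuming the `hf_transfer` package is installed (`pip install hf_transfer`); the repo id is taken from the Makefile targets above:

```python
import os

# The flag is read from the environment when huggingface_hub is imported,
# so set it first -- exactly what the Makefile recipe does for the CLI.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import hf_hub_download

# Illustrative single-file download; `download-weights` fetches the full set.
path = hf_hub_download(repo_id="bigscience/bloom-560m", filename="config.json")
print(path)
```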

README.md
@@ -39,15 +39,17 @@ to power LLMs api-inference widgets.

 ## Features

+- Serve the most popular Large Language Models with a simple launcher
+- Tensor Parallelism for faster inference on multiple GPUs
 - Token streaming using Server-Sent Events (SSE)
 - [Dynamic batching of incoming requests](https://github.com/huggingface/text-generation-inference/blob/main/router/src/batcher.rs#L88) for increased total throughput
 - Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
 - [Safetensors](https://github.com/huggingface/safetensors) weight loading
-- 45ms per token generation for BLOOM with 8xA100 80GB
+- Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)
 - Logits warpers (temperature scaling, topk, repetition penalty ...)
 - Stop sequences
 - Log probabilities
-- Distributed tracing with Open Telemetry
+- Production ready (distributed tracing with Open Telemetry, Prometheus metrics)

 ## Officially supported architectures

@@ -58,6 +60,7 @@ to power LLMs api-inference widgets.
 - [SantaCoder](https://huggingface.co/bigcode/santacoder)
 - [GPT-Neox 20B](https://huggingface.co/EleutherAI/gpt-neox-20b)
 - [FLAN-T5-XXL](https://huggingface.co/google/flan-t5-xxl)
+- [FLAN-UL2](https://huggingface.co/google/flan-ul2)

 Other architectures are supported on a best effort basis using:

@@ -97,6 +100,30 @@ curl 127.0.0.1:8080/generate_stream \
     -H 'Content-Type: application/json'
 ```

+or from Python:
+
+```python
+import requests
+
+result = requests.post("http://127.0.0.1:8080/generate", json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
+print(result.json())
+```
+
+```shell
+pip install sseclient-py
+```
+
+````python
+import sseclient
+import requests
+
+r = requests.post("http://127.0.0.1:8080/generate_stream", stream=True, json={"inputs":"Testing API","parameters":{"max_new_tokens":9}})
+sse_client = sseclient.SSEClient(r)
+
+for i, event in enumerate(sse_client.events()):
+    print(i, event.data)
+````
+
 **Note:** To use GPUs, you need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).

 ### API documentation
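
The `event.data` payloads in the streaming snippet above are JSON. A sketch of decoding them, under the assumption (not shown in this diff) that each event carries a `token` object and only the final event carries a non-null `generated_text`:

```python
import json

import requests
import sseclient

r = requests.post(
    "http://127.0.0.1:8080/generate_stream",
    stream=True,
    json={"inputs": "Testing API", "parameters": {"max_new_tokens": 9}},
)
sse_client = sseclient.SSEClient(r)

for event in sse_client.events():
    payload = json.loads(event.data)
    # Assumed schema: `token.text` on every event, `generated_text` only on
    # the last one (null before that).
    print(payload["token"]["text"], end="", flush=True)
    if payload.get("generated_text") is not None:
        print("\n---\n" + payload["generated_text"])
```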

docs/openapi.json
@@ -11,7 +11,7 @@
       "name": "Apache 2.0",
       "url": "https://www.apache.org/licenses/LICENSE-2.0"
     },
-    "version": "0.3.1"
+    "version": "0.3.2"
   },
   "paths": {
     "/generate": {
@@ -290,6 +290,11 @@
           "nullable": true,
           "exclusiveMinimum": 0.0
         },
+        "return_full_text": {
+          "type": "boolean",
+          "default": "None",
+          "example": false
+        },
         "seed": {
           "type": "integer",
           "format": "int64"
@@ -328,6 +333,11 @@
           "nullable": true,
           "maximum": 1.0,
           "exclusiveMinimum": 0.0
+        },
+        "watermark": {
+          "type": "boolean",
+          "default": "false",
+          "example": true
         }
       }
     },
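
The two schema additions above surface as request parameters on `/generate`. A short sketch exercising both; the field names come straight from the OpenAPI diff, while the URL and prompt are illustrative:

```python
import requests

response = requests.post(
    "http://127.0.0.1:8080/generate",
    json={
        "inputs": "Testing API",
        "parameters": {
            "max_new_tokens": 9,
            "return_full_text": False,  # schema default: None
            "watermark": True,          # schema default: false
        },
    },
)
print(response.json())
```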

launcher/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-generation-launcher"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 description = "Text Generation Launcher"

router/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-generation-router"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 description = "Text Generation Webserver"

router/client/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-generation-client"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"

 [dependencies]

router/grpc-metadata/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "grpc-metadata"
-version = "0.3.1"
+version = "0.3.2"
 edition = "2021"

 [dependencies]

server/Makefile
@@ -1,4 +1,4 @@
-transformers_commit := 712d62e83c28236c7f39af690e7792a54288dbd9
+transformers_commit := 2f87dca1ca3e5663d0637da9bb037a6956e57a5e

 gen-server:
 	# Compile protos

server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation"
-version = "0.3.1"
+version = "0.3.2"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]