feat(router): refactor API and add openAPI schemas (#53)

parent b1482d9048
commit 20c3c5940c

@@ -5,6 +5,8 @@ on:
   push:
     branches:
       - 'main'
+    tags:
+      - 'v*'
   pull_request:
     branches:
       - 'main'

@@ -43,6 +45,8 @@ jobs:
             ghcr.io/huggingface/text-generation-inference
             registry.internal.huggingface.tech/api-inference/community/text-generation-inference
           tags: |
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
             type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }}
             type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }}
       - name: Build and push Docker image
@ -83,9 +83,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "axum"
|
name = "axum"
|
||||||
version = "0.5.17"
|
version = "0.6.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "acee9fd5073ab6b045a275b3e709c163dd36c90685219cb21804a147b58dba43"
|
checksum = "e5694b64066a2459918d8074c2ce0d5a88f409431994c2356617c8ae0c4721fc"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"axum-core",
|
"axum-core",
|
||||||
|
@ -101,8 +101,10 @@ dependencies = [
|
||||||
"mime",
|
"mime",
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
"pin-project-lite",
|
"pin-project-lite",
|
||||||
|
"rustversion",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"serde_path_to_error",
|
||||||
"serde_urlencoded",
|
"serde_urlencoded",
|
||||||
"sync_wrapper",
|
"sync_wrapper",
|
||||||
"tokio",
|
"tokio",
|
||||||
|
@ -114,9 +116,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "axum-core"
|
name = "axum-core"
|
||||||
version = "0.2.9"
|
version = "0.3.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "37e5939e02c56fecd5c017c37df4238c0a839fa76b7f97acdd7efb804fd181cc"
|
checksum = "1cae3e661676ffbacb30f1a824089a8c9150e71017f7e1e38f2aa32009188d34"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"bytes",
|
"bytes",
|
||||||
|
@ -124,6 +126,7 @@ dependencies = [
|
||||||
"http",
|
"http",
|
||||||
"http-body",
|
"http-body",
|
||||||
"mime",
|
"mime",
|
||||||
|
"rustversion",
|
||||||
"tower-layer",
|
"tower-layer",
|
||||||
"tower-service",
|
"tower-service",
|
||||||
]
|
]
|
||||||
|
@ -207,7 +210,7 @@ dependencies = [
|
||||||
"tar",
|
"tar",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"zip",
|
"zip 0.5.13",
|
||||||
"zip-extensions",
|
"zip-extensions",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -465,6 +468,15 @@ dependencies = [
|
||||||
"dirs-sys",
|
"dirs-sys",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dirs"
|
||||||
|
version = "4.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059"
|
||||||
|
dependencies = [
|
||||||
|
"dirs-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "dirs-sys"
|
name = "dirs-sys"
|
||||||
version = "0.3.7"
|
version = "0.3.7"
|
||||||
|
@ -867,6 +879,7 @@ checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"autocfg",
|
"autocfg",
|
||||||
"hashbrown",
|
"hashbrown",
|
||||||
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -999,9 +1012,9 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "matchit"
|
name = "matchit"
|
||||||
version = "0.5.0"
|
version = "0.7.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb"
|
checksum = "b87248edafb776e59e6ee64a79086f65890d3510f2c656c000bf2a7e8a0aea40"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memchr"
|
name = "memchr"
|
||||||
|
@ -1024,6 +1037,16 @@ version = "0.3.16"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mime_guess"
|
||||||
|
version = "2.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
|
||||||
|
dependencies = [
|
||||||
|
"mime",
|
||||||
|
"unicase",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "minimal-lexical"
|
name = "minimal-lexical"
|
||||||
version = "0.2.1"
|
version = "0.2.1"
|
||||||
|
@ -1552,12 +1575,62 @@ dependencies = [
|
||||||
"winreg",
|
"winreg",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rust-embed"
|
||||||
|
version = "6.4.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "283ffe2f866869428c92e0d61c2f35dfb4355293cdfdc48f49e895c15f1333d1"
|
||||||
|
dependencies = [
|
||||||
|
"rust-embed-impl",
|
||||||
|
"rust-embed-utils",
|
||||||
|
"walkdir",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rust-embed-impl"
|
||||||
|
version = "6.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "31ab23d42d71fb9be1b643fe6765d292c5e14d46912d13f3ae2815ca048ea04d"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"rust-embed-utils",
|
||||||
|
"shellexpand",
|
||||||
|
"syn",
|
||||||
|
"walkdir",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rust-embed-utils"
|
||||||
|
version = "7.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c1669d81dfabd1b5f8e2856b8bbe146c6192b0ba22162edc738ac0a5de18f054"
|
||||||
|
dependencies = [
|
||||||
|
"sha2",
|
||||||
|
"walkdir",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustversion"
|
||||||
|
version = "1.0.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5583e89e108996506031660fe09baa5011b9dd0341b89029313006d1fb508d70"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ryu"
|
name = "ryu"
|
||||||
version = "1.0.11"
|
version = "1.0.11"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"
|
checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "same-file"
|
||||||
|
version = "1.0.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-util",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "schannel"
|
name = "schannel"
|
||||||
version = "0.1.20"
|
version = "0.1.20"
|
||||||
|
@ -1628,6 +1701,15 @@ dependencies = [
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_path_to_error"
|
||||||
|
version = "0.1.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "26b04f22b563c91331a10074bda3dd5492e3cc39d56bd557e91c0af42b6c7341"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_urlencoded"
|
name = "serde_urlencoded"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
|
@ -1660,6 +1742,15 @@ dependencies = [
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "shellexpand"
|
||||||
|
version = "2.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7ccc8076840c4da029af4f87e4e8daeb0fca6b87bbb02e10cb60b791450e11e4"
|
||||||
|
dependencies = [
|
||||||
|
"dirs 4.0.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "signal-hook-registry"
|
name = "signal-hook-registry"
|
||||||
version = "1.4.0"
|
version = "1.4.0"
|
||||||
|
@ -1797,7 +1888,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-client"
|
name = "text-generation-client"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"futures",
|
"futures",
|
||||||
"prost",
|
"prost",
|
||||||
|
@ -1812,7 +1903,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-launcher"
|
name = "text-generation-launcher"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap 4.0.22",
|
"clap 4.0.22",
|
||||||
"ctrlc",
|
"ctrlc",
|
||||||
|
@ -1827,7 +1918,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "text-generation-router"
|
name = "text-generation-router"
|
||||||
version = "0.1.0"
|
version = "0.2.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-stream",
|
"async-stream",
|
||||||
"axum",
|
"axum",
|
||||||
|
@ -1845,6 +1936,8 @@ dependencies = [
|
||||||
"tokio-stream",
|
"tokio-stream",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-subscriber",
|
"tracing-subscriber",
|
||||||
|
"utoipa",
|
||||||
|
"utoipa-swagger-ui",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -1921,7 +2014,7 @@ dependencies = [
|
||||||
"cached-path",
|
"cached-path",
|
||||||
"clap 2.34.0",
|
"clap 2.34.0",
|
||||||
"derive_builder",
|
"derive_builder",
|
||||||
"dirs",
|
"dirs 3.0.2",
|
||||||
"esaxx-rs",
|
"esaxx-rs",
|
||||||
"getrandom",
|
"getrandom",
|
||||||
"indicatif 0.15.0",
|
"indicatif 0.15.0",
|
||||||
|
@ -2234,6 +2327,15 @@ version = "1.15.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
|
checksum = "dcf81ac59edc17cc8697ff311e8f5ef2d99fcbd9817b34cec66f90b6c3dfd987"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicase"
|
||||||
|
version = "2.6.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
|
||||||
|
dependencies = [
|
||||||
|
"version_check",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-bidi"
|
name = "unicode-bidi"
|
||||||
version = "0.3.8"
|
version = "0.3.8"
|
||||||
|
@ -2293,6 +2395,46 @@ dependencies = [
|
||||||
"percent-encoding",
|
"percent-encoding",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utoipa"
|
||||||
|
version = "3.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3920fa753064b1be7842bea26175ffa0dfc4a8f30bcb52b8ff03fddf8889914c"
|
||||||
|
dependencies = [
|
||||||
|
"indexmap",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"utoipa-gen",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utoipa-gen"
|
||||||
|
version = "3.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "720298fac6efca20df9e457e67a1eab41a20d1c3101380b5c4dca1ca60ae0062"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro-error",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utoipa-swagger-ui"
|
||||||
|
version = "3.0.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ae3d4f4da6408f0f20ff58196ed619c94306ab32635aeca3d3fa0768c0bd0de2"
|
||||||
|
dependencies = [
|
||||||
|
"axum",
|
||||||
|
"mime_guess",
|
||||||
|
"regex",
|
||||||
|
"rust-embed",
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
"utoipa",
|
||||||
|
"zip 0.6.4",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "valuable"
|
name = "valuable"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
@ -2317,6 +2459,17 @@ version = "0.9.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "walkdir"
|
||||||
|
version = "2.3.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
|
||||||
|
dependencies = [
|
||||||
|
"same-file",
|
||||||
|
"winapi",
|
||||||
|
"winapi-util",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "want"
|
name = "want"
|
||||||
version = "0.3.0"
|
version = "0.3.0"
|
||||||
|
@ -2589,11 +2742,23 @@ dependencies = [
|
||||||
"time",
|
"time",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zip"
|
||||||
|
version = "0.6.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0445d0fbc924bb93539b4316c11afb121ea39296f99a3c4c9edad09e3658cdef"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"crc32fast",
|
||||||
|
"crossbeam-utils",
|
||||||
|
"flate2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zip-extensions"
|
name = "zip-extensions"
|
||||||
version = "0.6.1"
|
version = "0.6.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a64c3c977bc3434ce2d4bcea8ad3c644672de0f2c402b72b9171ca80a8885d14"
|
checksum = "a64c3c977bc3434ce2d4bcea8ad3c644672de0f2c402b72b9171ca80a8885d14"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"zip",
|
"zip 0.5.13",
|
||||||
]
|
]
|
||||||
|
|
|
@@ -4,9 +4,6 @@ members = [
     "router/client",
     "launcher"
 ]
-exclude = [
-    "server/safetensors",
-]
 
 [profile.release]
 debug = 1
Dockerfile (10 changed lines)
@@ -26,21 +26,18 @@ FROM nvidia/cuda:11.8.0-devel-ubuntu22.04
 ENV LANG=C.UTF-8 \
     LC_ALL=C.UTF-8 \
     DEBIAN_FRONTEND=noninteractive \
-    MODEL_BASE_PATH=/data \
+    HUGGINGFACE_HUB_CACHE=/data \
     MODEL_ID=bigscience/bloom-560m \
     QUANTIZE=false \
-    NUM_GPUS=1 \
+    NUM_SHARD=1 \
     SAFETENSORS_FAST_GPU=1 \
     PORT=80 \
-    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
     NCCL_ASYNC_ERROR_HANDLING=1 \
     CUDA_HOME=/usr/local/cuda \
     LD_LIBRARY_PATH="/opt/miniconda/envs/text-generation/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH" \
     CONDA_DEFAULT_ENV=text-generation \
     PATH=$PATH:/opt/miniconda/envs/text-generation/bin:/opt/miniconda/bin:/usr/local/cuda/bin
 
-SHELL ["/bin/bash", "-c"]
-
 RUN apt-get update && apt-get install -y unzip curl libssl-dev && rm -rf /var/lib/apt/lists/*
 
 RUN cd ~ && \
@@ -71,4 +68,5 @@ COPY --from=router-builder /usr/local/cargo/bin/text-generation-router /usr/loca
 # Install launcher
 COPY --from=launcher-builder /usr/local/cargo/bin/text-generation-launcher /usr/local/bin/text-generation-launcher
 
-CMD HUGGINGFACE_HUB_CACHE=$MODEL_BASE_PATH text-generation-launcher --num-shard $NUM_GPUS --model-name $MODEL_ID --json-output
+ENTRYPOINT ["text-generation-launcher"]
+CMD ["--json-output"]
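With the launcher set as the image `ENTRYPOINT` and `--json-output` as the default `CMD`, any arguments appended to `docker run` replace the default `CMD` and are forwarded to `text-generation-launcher`. A minimal sketch of such an invocation, reusing the flags from the README example in this same commit (the image tag and port mapping are assumptions for illustration):

```shell
# Arguments after the image name replace the default CMD (["--json-output"])
# and are passed straight to the text-generation-launcher ENTRYPOINT.
docker run --gpus all -p 8080:80 -v $PWD/data:/data \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id bigscience/bloom-560m --num-shard 2
```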
Makefile (14 changed lines)
@@ -15,17 +15,23 @@ server-dev:
 router-dev:
 	cd router && cargo run
 
+integration-tests: install-router install-launcher
+	cargo test
+
+python-tests:
+	cd server && pytest tests
+
 run-bloom-560m:
-	text-generation-launcher --model-name bigscience/bloom-560m --num-shard 2
+	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2
 
 run-bloom-560m-quantize:
-	text-generation-launcher --model-name bigscience/bloom-560m --num-shard 2 --quantize
+	text-generation-launcher --model-id bigscience/bloom-560m --num-shard 2 --quantize
 
 download-bloom:
 	text-generation-server download-weights bigscience/bloom
 
 run-bloom:
-	text-generation-launcher --model-name bigscience/bloom --num-shard 8
+	text-generation-launcher --model-id bigscience/bloom --num-shard 8
 
 run-bloom-quantize:
-	text-generation-launcher --model-name bigscience/bloom --num-shard 8 --quantize
+	text-generation-launcher --model-id bigscience/bloom --num-shard 8 --quantize
README.md (113 changed lines)
@@ -1,16 +1,43 @@
+<div align="center">
+
 # Text Generation Inference
 
-<div align="center">
+<a href="https://github.com/huggingface/text-generation-inference">
+  <img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/huggingface/text-generation-inference?style=social">
+</a>
+<a href="https://github.com/huggingface/text-generation-inference/blob/main/LICENSE">
+  <img alt="License" src="https://img.shields.io/github/license/huggingface/text-generation-inference">
+</a>
+<a href="https://huggingface.github.io/text-generation-inference">
+  <img alt="Swagger API documentation" src="https://img.shields.io/badge/API-Swagger-informational">
+</a>
 
 ![architecture](assets/architecture.jpg)
 
 </div>
 
-A Rust and gRPC server for text generation inference. Used in production at [HuggingFace](https://huggingface.co)
-to power Bloom, BloomZ and MT0-XXL api-inference widgets.
+A Rust, Python and gRPC server for text generation inference. Used in production at [HuggingFace](https://huggingface.co)
+to power LLMs api-inference widgets.
+
+## Table of contents
+
+- [Features](#features)
+- [Officially Supported Models](#officially-supported-models)
+- [Get Started](#get-started)
+  - [Docker](#docker)
+  - [Local Install](#local-install)
+  - [OpenAPI](#api-documentation)
+  - [CUDA Kernels](#cuda-kernels)
+- [Run BLOOM](#run-bloom)
+  - [Download](#download)
+  - [Run](#run)
+  - [Quantization](#quantization)
+- [Develop](#develop)
+- [Testing](#testing)
 
 ## Features
 
+- Token streaming using Server Side Events (SSE)
 - [Dynamic batching of incoming requests](https://github.com/huggingface/text-generation-inference/blob/main/router/src/batcher.rs#L88) for increased total throughput
 - Quantization with [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
 - [Safetensors](https://github.com/huggingface/safetensors) weight loading
@@ -36,30 +63,63 @@ or
 
 `AutoModelForSeq2SeqLM.from_pretrained(<model>, device_map="auto")`
 
-## Load Tests for BLOOM
+## Get started
 
-See `k6/load_test.js`
+### Docker
 
-| | avg | min | med | max | p(90) | p(95) | RPS |
-|--------------------------------------------------------------|-----------|--------------|-----------|------------|-----------|-----------|----------|
-| [Original code](https://github.com/huggingface/transformers_bloom_parallel) | 8.9s | 1s | 9.12s | 16.69s | 13.7s | 14.26s | 5.9 |
-| New batching logic | **5.44s** | **959.53ms** | **5.28s** | **13.12s** | **7.78s** | **8.92s** | **9.08** |
+The easiest way of getting started is using the official Docker container:
 
-## Install
-
 ```shell
-make install
+model=bigscience/bloom-560m
+num_shard=2
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+
+docker run --gpus all -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --num-shard $num_shard
 ```
 
-## Run
+You can then query the model using either the `/generate` or `/generate_stream` routes:
 
-### BLOOM 560-m
-
 ```shell
+curl 127.0.0.1:8080/generate \
+    -X POST \
+    -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \
+    -H 'Content-Type: application/json'
+```
+
+```shell
+curl 127.0.0.1:8080/generate_stream \
+    -X POST \
+    -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \
+    -H 'Content-Type: application/json'
+```
+
+To use GPUs, you will need to install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
+
+### API documentation
+
+You can consult the OpenAPI documentation of the `text-generation-inference` REST API using the `/docs` route.
+The Swagger UI is also available at: [https://huggingface.github.io/text-generation-inference](https://huggingface.github.io/text-generation-inference).
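For example, with the container from the Docker section above mapped to port 8080, a quick way to confirm that the interactive documentation is being served is to request the `/docs` route and print only the HTTP status code (opening the same URL in a browser shows the Swagger UI); the host and port here simply follow the earlier example:

```shell
# The /docs route serves the Swagger UI for the OpenAPI schema bundled with the router.
# Prints the HTTP status code only; redirects are followed.
curl -sL -o /dev/null -w "%{http_code}\n" 127.0.0.1:8080/docs
```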
+
+### Local install
+
+You can also opt to install `text-generation-inference` locally. You will need to have cargo and Python installed on your
+machine
+
+```shell
+BUILD_EXTENSIONS=True make install # Install repository and HF/transformer fork with CUDA kernels
 make run-bloom-560m
 ```
 
-### BLOOM
+### CUDA Kernels
 
+The custom CUDA kernels are only tested on NVIDIA A100s. If you have any installation or runtime issues, you can remove
+the kernels by using the `BUILD_EXTENSIONS=False` environment variable.
+
+Be aware that the official Docker image has them enabled by default.
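For instance, a local install that skips the custom kernels could look like the following sketch, reusing the Makefile targets shown above (the only change from the earlier example is the `BUILD_EXTENSIONS` value):

```shell
# Build without the custom CUDA kernels (they are only tested on NVIDIA A100s)
BUILD_EXTENSIONS=False make install
make run-bloom-560m
```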
+
+## Run BLOOM
+
+### Download
+
 First you need to download the weights:
 
@@ -67,29 +127,30 @@ First you need to download the weights:
 make download-bloom
 ```
 
+### Run
+
 ```shell
 make run-bloom # Requires 8xA100 80GB
 ```
 
+### Quantization
+
 You can also quantize the weights with bitsandbytes to reduce the VRAM requirement:
 
 ```shell
 make run-bloom-quantize # Requires 8xA100 40GB
 ```
 
-## Test
-
-```shell
-curl 127.0.0.1:3000/generate \
-    -v \
-    -X POST \
-    -d '{"inputs":"Testing API","parameters":{"max_new_tokens":9}}' \
-    -H 'Content-Type: application/json'
-```
-
 ## Develop
 
 ```shell
 make server-dev
 make router-dev
 ```
+
+## Testing
+
+```shell
+make python-tests
+make integration-tests
+```
@@ -4,9 +4,9 @@ endpoint_name: bloom-inference
 model: azureml:bloom:1
 model_mount_path: /var/azureml-model
 environment_variables:
-  MODEL_BASE_PATH: /var/azureml-model/bloom
+  HUGGINGFACE_HUB_CACHE: /var/azureml-model/bloom
   MODEL_ID: bigscience/bloom
-  NUM_GPUS: 8
+  NUM_SHARD: 8
 environment:
   image: db4c2190dd824d1f950f5d1555fbadf0.azurecr.io/text-generation-inference:0.3.1
 inference_config:

Binary file not shown. Before: 132 KiB, After: 334 KiB.
@@ -0,0 +1,30 @@
+<html>
+<head>
+    <!-- Load the latest Swagger UI code and style from npm using unpkg.com -->
+    <script src="https://unpkg.com/swagger-ui-dist@3/swagger-ui-bundle.js"></script>
+    <link rel="stylesheet" type="text/css" href="https://unpkg.com/swagger-ui-dist@3/swagger-ui.css"/>
+    <title>Text Generation Inference API</title>
+</head>
+<body>
+<div id="swagger-ui"></div> <!-- Div to hold the UI component -->
+<script>
+    window.onload = function () {
+        // Begin Swagger UI call region
+        const ui = SwaggerUIBundle({
+            url: "openapi.json", //Location of Open API spec in the repo
+            dom_id: '#swagger-ui',
+            deepLinking: true,
+            supportedSubmitMethods: [],
+            presets: [
+                SwaggerUIBundle.presets.apis,
+                SwaggerUIBundle.SwaggerUIStandalonePreset
+            ],
+            plugins: [
+                SwaggerUIBundle.plugins.DownloadUrl
+            ],
+        })
+        window.ui = ui
+    }
+</script>
+</body>
+</html>
@ -0,0 +1,446 @@
|
||||||
|
{
|
||||||
|
"openapi": "3.0.3",
|
||||||
|
"info": {
|
||||||
|
"title": "Text Generation Inference",
|
||||||
|
"description": "Text Generation Webserver",
|
||||||
|
"contact": {
|
||||||
|
"name": "Olivier Dehaene",
|
||||||
|
"email": ""
|
||||||
|
},
|
||||||
|
"license": {
|
||||||
|
"name": "Apache 2.0",
|
||||||
|
"url": "https://www.apache.org/licenses/LICENSE-2.0"
|
||||||
|
},
|
||||||
|
"version": "0.2.0"
|
||||||
|
},
|
||||||
|
"paths": {
|
||||||
|
"/generate": {
|
||||||
|
"post": {
|
||||||
|
"tags": [
|
||||||
|
"Text Generation Inference"
|
||||||
|
],
|
||||||
|
"summary": "Generate tokens",
|
||||||
|
"description": "Generate tokens",
|
||||||
|
"operationId": "generate",
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/GenerateRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
},
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "Generated Text",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/GenerateResponse"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"422": {
|
||||||
|
"description": "Input validation error",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Input validation error"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"424": {
|
||||||
|
"description": "Generation Error",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Request failed during generation"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"description": "Model is overloaded",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Model is overloaded"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"description": "Incomplete generation",
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Incomplete generation"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"deprecated": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"/generate_stream": {
|
||||||
|
"post": {
|
||||||
|
"tags": [
|
||||||
|
"Text Generation Inference"
|
||||||
|
],
|
||||||
|
"summary": "Generate a stream of token using Server Side Events",
|
||||||
|
"description": "Generate a stream of token using Server Side Events",
|
||||||
|
"operationId": "generate_stream",
|
||||||
|
"requestBody": {
|
||||||
|
"content": {
|
||||||
|
"application/json": {
|
||||||
|
"schema": {
|
||||||
|
"$ref": "#/components/schemas/GenerateRequest"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": true
|
||||||
|
},
|
||||||
|
"responses": {
|
||||||
|
"200": {
|
||||||
|
"description": "Generated Text",
|
||||||
|
"content": {
|
||||||
|
"text/event-stream ": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/StreamResponse"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"422": {
|
||||||
|
"description": "Input validation error",
|
||||||
|
"content": {
|
||||||
|
"text/event-stream ": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Input validation error"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"424": {
|
||||||
|
"description": "Generation Error",
|
||||||
|
"content": {
|
||||||
|
"text/event-stream ": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Request failed during generation"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"429": {
|
||||||
|
"description": "Model is overloaded",
|
||||||
|
"content": {
|
||||||
|
"text/event-stream ": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Model is overloaded"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"500": {
|
||||||
|
"description": "Incomplete generation",
|
||||||
|
"content": {
|
||||||
|
"text/event-stream ": {
|
||||||
|
"schema": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/ErrorResponse"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"example": {
|
||||||
|
"error": "Incomplete generation"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"deprecated": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"components": {
|
||||||
|
"schemas": {
|
||||||
|
"Details": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"finish_reason",
|
||||||
|
"generated_tokens"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"finish_reason": {
|
||||||
|
"$ref": "#/components/schemas/FinishReason"
|
||||||
|
},
|
||||||
|
"generated_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int32",
|
||||||
|
"example": 1
|
||||||
|
},
|
||||||
|
"prefill": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/Token"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"seed": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int64",
|
||||||
|
"example": 42
|
||||||
|
},
|
||||||
|
"tokens": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"$ref": "#/components/schemas/Token"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"ErrorResponse": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"error"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"error": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"FinishReason": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": [
|
||||||
|
"length",
|
||||||
|
"eos_token",
|
||||||
|
"stop_sequence"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"GenerateParameters": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"details": {
|
||||||
|
"type": "boolean",
|
||||||
|
"default": "true"
|
||||||
|
},
|
||||||
|
"do_sample": {
|
||||||
|
"type": "boolean",
|
||||||
|
"default": "false",
|
||||||
|
"example": true
|
||||||
|
},
|
||||||
|
"max_new_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int32",
|
||||||
|
"default": "20",
|
||||||
|
"exclusiveMaximum": 512.0,
|
||||||
|
"exclusiveMinimum": 0.0
|
||||||
|
},
|
||||||
|
"repetition_penalty": {
|
||||||
|
"type": "number",
|
||||||
|
"format": "float",
|
||||||
|
"default": "null",
|
||||||
|
"example": 1.03,
|
||||||
|
"nullable": true,
|
||||||
|
"exclusiveMinimum": 0.0
|
||||||
|
},
|
||||||
|
"seed": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int64"
|
||||||
|
},
|
||||||
|
"stop": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "string"
|
||||||
|
},
|
||||||
|
"example": [
|
||||||
|
"photographer"
|
||||||
|
],
|
||||||
|
"maxItems": 4
|
||||||
|
},
|
||||||
|
"temperature": {
|
||||||
|
"type": "number",
|
||||||
|
"format": "float",
|
||||||
|
"default": "null",
|
||||||
|
"example": 0.5,
|
||||||
|
"nullable": true,
|
||||||
|
"exclusiveMinimum": 0.0
|
||||||
|
},
|
||||||
|
"top_k": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int32",
|
||||||
|
"default": "null",
|
||||||
|
"example": 10,
|
||||||
|
"nullable": true,
|
||||||
|
"exclusiveMinimum": 0.0
|
||||||
|
},
|
||||||
|
"top_p": {
|
||||||
|
"type": "number",
|
||||||
|
"format": "float",
|
||||||
|
"default": "null",
|
||||||
|
"example": 0.95,
|
||||||
|
"nullable": true,
|
||||||
|
"maximum": 1.0,
|
||||||
|
"exclusiveMinimum": 0.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"GenerateRequest": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"inputs"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"inputs": {
|
||||||
|
"type": "string",
|
||||||
|
"example": "My name is Olivier and I"
|
||||||
|
},
|
||||||
|
"parameters": {
|
||||||
|
"$ref": "#/components/schemas/GenerateParameters"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"GenerateResponse": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"generated_text"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"details": {
|
||||||
|
"$ref": "#/components/schemas/Details"
|
||||||
|
},
|
||||||
|
"generated_text": {
|
||||||
|
"type": "string",
|
||||||
|
"example": "test"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"StreamDetails": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"finish_reason",
|
||||||
|
"generated_tokens"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"finish_reason": {
|
||||||
|
"$ref": "#/components/schemas/FinishReason"
|
||||||
|
},
|
||||||
|
"generated_tokens": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int32",
|
||||||
|
"example": 1
|
||||||
|
},
|
||||||
|
"seed": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int64",
|
||||||
|
"example": 42
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"StreamResponse": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"token"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"details": {
|
||||||
|
"$ref": "#/components/schemas/StreamDetails"
|
||||||
|
},
|
||||||
|
"generated_text": {
|
||||||
|
"type": "string",
|
||||||
|
"default": "null",
|
||||||
|
"example": "test",
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"token": {
|
||||||
|
"$ref": "#/components/schemas/Token"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"Token": {
|
||||||
|
"type": "object",
|
||||||
|
"required": [
|
||||||
|
"id",
|
||||||
|
"text",
|
||||||
|
"logprob"
|
||||||
|
],
|
||||||
|
"properties": {
|
||||||
|
"id": {
|
||||||
|
"type": "integer",
|
||||||
|
"format": "int32",
|
||||||
|
"example": 0
|
||||||
|
},
|
||||||
|
"logprob": {
|
||||||
|
"type": "number",
|
||||||
|
"format": "float",
|
||||||
|
"example": -0.34,
|
||||||
|
"nullable": true
|
||||||
|
},
|
||||||
|
"text": {
|
||||||
|
"type": "string",
|
||||||
|
"example": "test"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"tags": [
|
||||||
|
{
|
||||||
|
"name": "Text Generation Inference",
|
||||||
|
"description": "Hugging Face Text Generation Inference API"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
@@ -1,6 +1,6 @@
 [package]
 name = "text-generation-launcher"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 description = "Text Generation Launcher"
@@ -19,7 +19,7 @@ use subprocess::{Popen, PopenConfig, PopenError, Redirection};
 #[clap(author, version, about, long_about = None)]
 struct Args {
     #[clap(default_value = "bigscience/bloom-560m", long, env)]
-    model_name: String,
+    model_id: String,
     #[clap(long, env)]
     revision: Option<String>,
     #[clap(long, env)]
@@ -49,7 +49,7 @@ struct Args {
 fn main() -> ExitCode {
     // Pattern match configuration
     let Args {
-        model_name,
+        model_id,
         revision,
         num_shard,
         quantize,
@@ -92,7 +92,7 @@ fn main() -> ExitCode {
 
     // Start shard processes
     for rank in 0..num_shard {
-        let model_name = model_name.clone();
+        let model_id = model_id.clone();
         let revision = revision.clone();
         let uds_path = shard_uds_path.clone();
         let master_addr = master_addr.clone();
@@ -101,7 +101,7 @@ fn main() -> ExitCode {
         let shutdown_sender = shutdown_sender.clone();
         thread::spawn(move || {
             shard_manager(
-                model_name,
+                model_id,
                 revision,
                 quantize,
                 uds_path,
@@ -167,7 +167,7 @@ fn main() -> ExitCode {
         "--master-shard-uds-path".to_string(),
         format!("{}-0", shard_uds_path),
         "--tokenizer-name".to_string(),
-        model_name,
+        model_id,
     ];
 
     if json_output {
@@ -256,7 +256,7 @@ enum ShardStatus {
 
 #[allow(clippy::too_many_arguments)]
 fn shard_manager(
-    model_name: String,
+    model_id: String,
     revision: Option<String>,
     quantize: bool,
     uds_path: String,
@@ -278,7 +278,7 @@ fn shard_manager(
     let mut shard_argv = vec![
         "text-generation-server".to_string(),
         "serve".to_string(),
-        model_name,
+        model_id,
         "--uds-path".to_string(),
         uds_path,
         "--logger-level".to_string(),
@ -1,123 +1,122 @@
|
||||||
[
|
{
|
||||||
{
|
"details": {
|
||||||
"details": {
|
"finish_reason": "length",
|
||||||
"finish_reason": "length",
|
"generated_tokens": 20,
|
||||||
"generated_tokens": 20,
|
"prefill": [
|
||||||
"prefill": [
|
{
|
||||||
[
|
"id": 10264,
|
||||||
10264,
|
"logprob": null,
|
||||||
"Test",
|
"text": "Test"
|
||||||
null
|
},
|
||||||
],
|
{
|
||||||
[
|
"id": 8821,
|
||||||
8821,
|
"logprob": -11.894989,
|
||||||
" request",
|
"text": " request"
|
||||||
-11.895094
|
}
|
||||||
]
|
],
|
||||||
],
|
"seed": null,
|
||||||
"tokens": [
|
"tokens": [
|
||||||
[
|
{
|
||||||
17,
|
"id": 17,
|
||||||
".",
|
"logprob": -1.8267672,
|
||||||
-1.8267941
|
"text": "."
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
1587,
|
"id": 1587,
|
||||||
"get",
|
"logprob": -2.4674969,
|
||||||
-2.4674964
|
"text": "get"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
11,
|
"id": 11,
|
||||||
"(",
|
"logprob": -1.906001,
|
||||||
-1.9060438
|
"text": "("
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
5,
|
"id": 5,
|
||||||
"\"",
|
"logprob": -1.2279545,
|
||||||
-1.2279553
|
"text": "\""
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
4899,
|
"id": 4899,
|
||||||
"action",
|
"logprob": -4.170299,
|
||||||
-4.170306
|
"text": "action"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
5,
|
"id": 5,
|
||||||
"\"",
|
"logprob": -0.32478866,
|
||||||
-0.3247902
|
"text": "\""
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
12,
|
"id": 12,
|
||||||
")",
|
"logprob": -1.0773665,
|
||||||
-1.0773602
|
"text": ")"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
30,
|
"id": 30,
|
||||||
";",
|
"logprob": -0.27640742,
|
||||||
-0.27640444
|
"text": ";"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
837,
|
"id": 837,
|
||||||
"\n ",
|
"logprob": -1.6970354,
|
||||||
-1.6970599
|
"text": "\n "
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
1320,
|
"id": 1320,
|
||||||
" if",
|
"logprob": -1.4495516,
|
||||||
-1.4495552
|
"text": " if"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
375,
|
"id": 375,
|
||||||
" (",
|
"logprob": -0.23609057,
|
||||||
-0.2360998
|
"text": " ("
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
4899,
|
"id": 4899,
|
||||||
"action",
|
"logprob": -1.1916996,
|
||||||
-1.1916926
|
"text": "action"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
3535,
|
"id": 3535,
|
||||||
" ==",
|
"logprob": -0.8918753,
|
||||||
-0.8918663
|
"text": " =="
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
5109,
|
"id": 5109,
|
||||||
" null",
|
"logprob": -0.3933342,
|
||||||
-0.39334255
|
"text": " null"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
12,
|
"id": 12,
|
||||||
")",
|
"logprob": -0.43212673,
|
||||||
-0.4321134
|
"text": ")"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
731,
|
"id": 731,
|
||||||
" {",
|
"logprob": -0.17702064,
|
||||||
-0.17701954
|
"text": " {"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
1260,
|
"id": 1260,
|
||||||
"\n ",
|
"logprob": -0.07027565,
|
||||||
-0.07027287
|
"text": "\n "
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
10519,
|
"id": 10519,
|
||||||
" throw",
|
"logprob": -1.3915029,
|
||||||
-1.3915133
|
"text": " throw"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
2084,
|
"id": 2084,
|
||||||
" new",
|
"logprob": -0.04201372,
|
||||||
-0.042013377
|
"text": " new"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
150858,
|
"id": 150858,
|
||||||
" RuntimeException",
|
"logprob": -1.7329919,
|
||||||
-1.7330077
|
"text": " RuntimeException"
|
||||||
]
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"generated_text": ".get(\"action\");\n if (action == null) {\n throw new RuntimeException"
|
"generated_text": ".get(\"action\");\n if (action == null) {\n throw new RuntimeException"
|
||||||
}
|
}
|
||||||
]
|
|
|
@@ -9,11 +9,18 @@ use std::thread::sleep;
 use std::time::Duration;
 use subprocess::{Popen, PopenConfig, Redirection};
 
+#[derive(Deserialize)]
+pub struct Token {
+    id: u32,
+    text: String,
+    logprob: Option<f32>,
+}
+
 #[derive(Deserialize)]
 struct Details {
     finish_reason: String,
     generated_tokens: u32,
-    tokens: Vec<(u32, String, Option<f32>)>,
+    tokens: Vec<Token>,
 }
 
 #[derive(Deserialize)]
@@ -22,11 +29,11 @@ struct GeneratedText {
     details: Details,
 }
 
-fn start_launcher(model_name: String, num_shard: usize, port: usize, master_port: usize) -> Popen {
+fn start_launcher(model_id: String, num_shard: usize, port: usize, master_port: usize) -> Popen {
     let argv = vec![
         "text-generation-launcher".to_string(),
-        "--model-name".to_string(),
-        model_name.clone(),
+        "--model-id".to_string(),
+        model_id.clone(),
         "--num-shard".to_string(),
         num_shard.to_string(),
         "--port".to_string(),
@@ -68,16 +75,16 @@ fn start_launcher(model_name: String, num_shard: usize, port: usize, master_port
 
     launcher.terminate().unwrap();
     launcher.wait().unwrap();
-    panic!("failed to launch {}", model_name)
+    panic!("failed to launch {}", model_id)
 }
 
 fn test_model(
-    model_name: String,
+    model_id: String,
     num_shard: usize,
     port: usize,
     master_port: usize,
 ) -> GeneratedText {
-    let mut launcher = start_launcher(model_name, num_shard, port, master_port);
+    let mut launcher = start_launcher(model_id, num_shard, port, master_port);
 
     let data = r#"
         {
@@ -109,8 +116,8 @@ fn read_json(name: &str) -> GeneratedText {
     let file = File::open(d).unwrap();
     let reader = BufReader::new(file);
 
-    let mut results: Vec<GeneratedText> = serde_json::from_reader(reader).unwrap();
-    results.pop().unwrap()
+    let result: GeneratedText = serde_json::from_reader(reader).unwrap();
+    result
 }
 
 fn compare_results(result: GeneratedText, expected: GeneratedText) {
@@ -127,13 +134,13 @@ fn compare_results(result: GeneratedText, expected: GeneratedText) {
         .into_iter()
         .zip(expected.details.tokens.into_iter())
     {
-        assert_eq!(token.0, expected_token.0);
-        assert_eq!(token.1, expected_token.1);
-        if let Some(logprob) = token.2 {
-            let expected_logprob = expected_token.2.unwrap();
+        assert_eq!(token.id, expected_token.id);
+        assert_eq!(token.text, expected_token.text);
+        if let Some(logprob) = token.logprob {
+            let expected_logprob = expected_token.logprob.unwrap();
             assert_float_eq!(logprob, expected_logprob, abs <= 0.001);
         } else {
-            assert_eq!(token.2, expected_token.2);
+            assert_eq!(token.logprob, expected_token.logprob);
         }
     }
 }
@ -1,118 +1,117 @@
|
||||||
[
|
{
|
||||||
{
|
"details": {
|
||||||
"details": {
|
"finish_reason": "length",
|
||||||
"finish_reason": "length",
|
"generated_tokens": 20,
|
||||||
"generated_tokens": 20,
|
"prefill": [
|
||||||
"prefill": [
|
{
|
||||||
[
|
"id": 0,
|
||||||
0,
|
"logprob": null,
|
||||||
"<pad>",
|
"text": "<pad>"
|
||||||
null
|
}
|
||||||
]
|
],
|
||||||
],
|
"seed": null,
|
||||||
"tokens": [
|
"tokens": [
|
||||||
[
|
{
|
||||||
259,
|
"id": 259,
|
||||||
"",
|
"logprob": -1.3656927,
|
||||||
-1.3656927
|
"text": ""
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
215100,
|
"id": 215100,
|
||||||
"\"\"\"",
|
"logprob": -2.6551573,
|
||||||
-2.6551573
|
"text": "\"\"\""
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
46138,
|
"id": 46138,
|
||||||
"Test",
|
"logprob": -1.8059857,
|
||||||
-1.8059857
|
"text": "Test"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
287,
|
"id": 287,
|
||||||
"the",
|
"logprob": -1.2102449,
|
||||||
-1.2102449
|
"text": "the"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
259,
|
"id": 259,
|
||||||
"",
|
"logprob": -1.6057279,
|
||||||
-1.6057279
|
"text": ""
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
49076,
|
"id": 49076,
|
||||||
"contents",
|
"logprob": -3.6060903,
|
||||||
-3.6060903
|
"text": "contents"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
304,
|
"id": 304,
|
||||||
"of",
|
"logprob": -0.5270343,
|
||||||
-0.5270343
|
"text": "of"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
287,
|
"id": 287,
|
||||||
"the",
|
"logprob": -0.62522805,
|
||||||
-0.62522805
|
"text": "the"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
259,
|
"id": 259,
|
||||||
"",
|
"logprob": -1.4069618,
|
||||||
-1.4069618
|
"text": ""
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
49076,
|
"id": 49076,
|
||||||
"contents",
|
"logprob": -2.621994,
|
||||||
-2.621994
|
"text": "contents"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
304,
|
"id": 304,
|
||||||
"of",
|
"logprob": -1.3172221,
|
||||||
-1.3172221
|
"text": "of"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
287,
|
"id": 287,
|
||||||
"the",
|
"logprob": -0.3501925,
|
||||||
-0.3501925
|
"text": "the"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
259,
|
"id": 259,
|
||||||
"",
|
"logprob": -0.7219573,
|
||||||
-0.7219573
|
"text": ""
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
49076,
|
"id": 49076,
|
||||||
"contents",
|
"logprob": -1.0494149,
|
||||||
-1.0494149
|
"text": "contents"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
260,
|
"id": 260,
|
||||||
".",
|
"logprob": -1.0803378,
|
||||||
-1.0803378
|
"text": "."
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
259,
|
"id": 259,
|
||||||
"",
|
"logprob": -0.32933083,
|
||||||
-0.32933083
|
"text": ""
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
215100,
|
"id": 215100,
|
||||||
"\"\"\"",
|
"logprob": -0.11268901,
|
||||||
-0.11268901
|
"text": "\"\"\""
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
2978,
|
"id": 2978,
|
||||||
"test",
|
"logprob": -1.5846587,
|
||||||
-1.5846587
|
"text": "test"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
290,
|
"id": 290,
|
||||||
"_",
|
"logprob": -0.49796978,
|
||||||
-0.49796978
|
"text": "_"
|
||||||
],
|
},
|
||||||
[
|
{
|
||||||
4125,
|
"id": 4125,
|
||||||
"test",
|
"logprob": -2.0026445,
|
||||||
-2.0026445
|
"text": "test"
|
||||||
]
|
}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"generated_text": "\"\"\"Test the contents of the contents of the contents. \"\"\" test_test"
|
"generated_text": "\"\"\"Test the contents of the contents of the contents. \"\"\" test_test"
|
||||||
}
|
}
|
||||||
]
|
|
@@ -71,13 +71,19 @@ message Batch {
     uint32 size = 3;
 }
 
+enum FinishReason {
+    FINISH_REASON_LENGTH = 0;
+    FINISH_REASON_EOS_TOKEN = 1;
+    FINISH_REASON_STOP_SEQUENCE = 2;
+}
+
 message GeneratedText {
     /// Output
     string text = 1;
     /// Number of generated tokens
     uint32 generated_tokens = 2;
     /// Finish reason
-    string finish_reason = 3;
+    FinishReason finish_reason = 3;
     /// Seed
     optional uint64 seed = 4;
 }
@@ -1,6 +1,6 @@
 [package]
 name = "text-generation-router"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 authors = ["Olivier Dehaene"]
 description = "Text Generation Webserver"
@@ -14,7 +14,7 @@ path = "src/main.rs"
 
 [dependencies]
 async-stream = "0.3.3"
-axum = { version = "0.5.16", features = ["json", "serde_json"] }
+axum = { version = "0.6.4", features = ["json"] }
 text-generation-client = { path = "client" }
 clap = { version = "4.0.15", features = ["derive", "env"] }
 futures = "0.3.24"
@@ -29,4 +29,6 @@ tokio = { version = "1.21.1", features = ["rt", "rt-multi-thread", "parking_lot"
 tokio-stream = "0.1.11"
 tracing = "0.1.36"
 tracing-subscriber = { version = "0.3.15", features = ["json"] }
+utoipa = { version = "3.0.1", features = ["axum_extras"] }
+utoipa-swagger-ui = { version = "3.0.2", features = ["axum"] }
@@ -1,6 +1,6 @@
 [package]
 name = "text-generation-client"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 
 [dependencies]
@@ -7,8 +7,8 @@ mod sharded_client;
 
 pub use client::Client;
 pub use pb::generate::v1::{
-    Batch, GeneratedText, Generation, NextTokenChooserParameters, PrefillTokens, Request,
-    StoppingCriteriaParameters,
+    Batch, FinishReason, GeneratedText, Generation, NextTokenChooserParameters, PrefillTokens,
+    Request, StoppingCriteriaParameters,
 };
 pub use sharded_client::ShardedClient;
 use thiserror::Error;
@@ -127,7 +127,7 @@ impl Infer {
                 .into_iter()
                 .zip(tokens.logprobs.into_iter())
                 .zip(tokens.texts.into_iter())
-                .map(|((id, logprob), text)| Token(id, text, logprob))
+                .map(|((id, logprob), text)| Token { id, text, logprob })
                 .collect();
         }
         // Push last token
@@ -282,11 +282,11 @@ fn send_generations(generations: Vec<Generation>, entries: &mut IntMap<u64, Entr
         }
 
         // Create last Token
-        let token = Token(
-            generation.token_id,
-            generation.token_text,
-            generation.token_logprob,
-        );
+        let token = Token {
+            id: generation.token_id,
+            text: generation.token_text,
+            logprob: generation.token_logprob,
+        };
 
         if let Some(generated_text) = generation.generated_text {
             // Remove entry as this is the last message
@@ -1,5 +1,4 @@
 /// Text Generation Inference Webserver
-
 mod infer;
 mod queue;
 pub mod server;
@@ -8,45 +7,55 @@ mod validation;
 use infer::Infer;
 use queue::{Entry, Queue};
 use serde::{Deserialize, Serialize};
+use utoipa::ToSchema;
 use validation::Validation;
 
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, ToSchema)]
 pub(crate) struct GenerateParameters {
-    #[serde(default = "default_temperature")]
-    pub temperature: f32,
-    #[serde(default = "default_repetition_penalty")]
-    pub repetition_penalty: f32,
-    #[serde(default = "default_top_k")]
-    pub top_k: i32,
-    #[serde(default = "default_top_p")]
-    pub top_p: f32,
+    #[serde(default)]
+    #[schema(
+        exclusive_minimum = 0.0,
+        nullable = true,
+        default = "null",
+        example = 0.5
+    )]
+    pub temperature: Option<f32>,
+    #[serde(default)]
+    #[schema(
+        exclusive_minimum = 0.0,
+        nullable = true,
+        default = "null",
+        example = 1.03
+    )]
+    pub repetition_penalty: Option<f32>,
+    #[serde(default)]
+    #[schema(exclusive_minimum = 0, nullable = true, default = "null", example = 10)]
+    pub top_k: Option<i32>,
+    #[serde(default)]
+    #[schema(
+        exclusive_minimum = 0.0,
+        maximum = 1.0,
+        nullable = true,
+        default = "null",
+        example = 0.95
+    )]
+    pub top_p: Option<f32>,
     #[serde(default = "default_do_sample")]
+    #[schema(default = "false", example = true)]
     pub do_sample: bool,
     #[serde(default = "default_max_new_tokens")]
+    #[schema(exclusive_minimum = 0, exclusive_maximum = 512, default = "20")]
    pub max_new_tokens: u32,
     #[serde(default)]
+    #[schema(inline, max_items = 4, example = json!(["photographer"]))]
     pub stop: Vec<String>,
     #[serde(default)]
+    #[schema(default = "true")]
     pub details: bool,
     #[serde(default)]
     pub seed: Option<u64>,
 }
 
-fn default_temperature() -> f32 {
-    1.0
-}
-fn default_repetition_penalty() -> f32 {
-    1.0
-}
-
-fn default_top_k() -> i32 {
-    0
-}
-
-fn default_top_p() -> f32 {
-    1.0
-}
 
 fn default_do_sample() -> bool {
     false
 }
@@ -57,10 +66,10 @@ fn default_max_new_tokens() -> u32 {
 
 fn default_parameters() -> GenerateParameters {
     GenerateParameters {
-        temperature: default_temperature(),
-        repetition_penalty: default_repetition_penalty(),
-        top_k: default_top_k(),
-        top_p: default_top_p(),
+        temperature: None,
+        repetition_penalty: None,
+        top_k: None,
+        top_p: None,
         do_sample: default_do_sample(),
         max_new_tokens: default_max_new_tokens(),
         stop: vec![],
@@ -69,42 +78,77 @@ fn default_parameters() -> GenerateParameters {
     }
 }
 
-#[derive(Clone, Debug, Deserialize)]
+#[derive(Clone, Debug, Deserialize, ToSchema)]
 pub(crate) struct GenerateRequest {
+    #[schema(example = "My name is Olivier and I")]
     pub inputs: String,
     #[serde(default = "default_parameters")]
     pub parameters: GenerateParameters,
 }
 
-#[derive(Debug, Serialize)]
-pub struct Token(u32, String, f32);
+#[derive(Debug, Serialize, ToSchema)]
+pub struct Token {
+    #[schema(example = 0)]
+    id: u32,
+    #[schema(example = "test")]
+    text: String,
+    #[schema(nullable = true, example = -0.34)]
+    logprob: f32,
+}
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
+#[serde(rename_all(serialize = "snake_case"))]
+pub(crate) enum FinishReason {
+    #[schema(rename = "length")]
+    Length,
+    #[serde(rename = "eos_token")]
+    #[schema(rename = "eos_token")]
+    EndOfSequenceToken,
+    #[schema(rename = "stop_sequence")]
+    StopSequence,
+}
+
+#[derive(Serialize, ToSchema)]
 pub(crate) struct Details {
-    pub finish_reason: String,
+    #[schema(example = "length")]
+    pub finish_reason: FinishReason,
+    #[schema(example = 1)]
     pub generated_tokens: u32,
+    #[schema(example = 42)]
     pub seed: Option<u64>,
-    #[serde(skip_serializing_if = "Option::is_none")]
     pub prefill: Option<Vec<Token>>,
-    #[serde(skip_serializing_if = "Option::is_none")]
     pub tokens: Option<Vec<Token>>,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 pub(crate) struct GenerateResponse {
+    #[schema(example = "test")]
     pub generated_text: String,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub details: Option<Details>,
 }
 
-#[derive(Serialize)]
-pub(crate) struct StreamResponse {
-    pub token: Token,
-    pub generated_text: Option<String>,
-    pub details: Option<Details>,
+#[derive(Serialize, ToSchema)]
+pub(crate) struct StreamDetails {
+    #[schema(example = "length")]
+    pub finish_reason: FinishReason,
+    #[schema(example = 1)]
+    pub generated_tokens: u32,
+    #[schema(example = 42)]
+    pub seed: Option<u64>,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
+pub(crate) struct StreamResponse {
+    pub token: Token,
+    #[schema(nullable = true, default = "null", example = "test")]
+    pub generated_text: Option<String>,
+    #[schema(nullable = true, default = "null")]
+    pub details: Option<StreamDetails>,
+}
+
+#[derive(Serialize, ToSchema)]
 pub(crate) struct ErrorResponse {
+    #[schema(inline)]
     pub error: String,
 }
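With the new schema, every sampling parameter is optional and nullable, and the documented example values sit directly on the struct fields. A hedged sketch of a request against /generate follows; the host and port are assumptions (adjust them to wherever the router is actually listening), and the payload simply reuses the example values declared in the schema attributes above.

# Hedged sketch: calling /generate with the new nullable parameters.
import requests

payload = {
    "inputs": "My name is Olivier and I",
    "parameters": {
        # Omitted fields fall back to server-side defaults.
        "temperature": 0.5,
        "repetition_penalty": 1.03,
        "top_k": 10,
        "top_p": 0.95,
        "do_sample": True,
        "max_new_tokens": 20,
        "stop": ["photographer"],
        "details": True,
    },
}

resp = requests.post("http://localhost:3000/generate", json=payload, timeout=60)
resp.raise_for_status()
body = resp.json()
# Depending on the deployment the body may be a single object or a list of them.
result = body[0] if isinstance(body, list) else body
print(result["generated_text"])
# With `details: true`, finish_reason, generated_tokens, seed, prefill and
# tokens are also returned.
print(result["details"]["finish_reason"], result["details"]["generated_tokens"])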
@@ -1,8 +1,8 @@
 /// HTTP Server logic
 use crate::infer::{InferError, InferStreamResponse};
 use crate::{
-    Details, ErrorResponse, GenerateParameters, GenerateRequest, GenerateResponse, Infer,
-    StreamResponse, Validation,
+    Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest, GenerateResponse,
+    Infer, StreamDetails, StreamResponse, Token, Validation,
 };
 use axum::extract::Extension;
 use axum::http::{HeaderMap, StatusCode};
@@ -19,6 +19,8 @@ use tokio::signal;
 use tokio::time::Instant;
 use tokio_stream::StreamExt;
 use tracing::instrument;
+use utoipa::OpenApi;
+use utoipa_swagger_ui::SwaggerUi;
 
 /// Health check method
 #[instrument(skip(infer))]
@@ -32,13 +34,13 @@ async fn health(infer: Extension<Infer>) -> Result<(), (StatusCode, Json<ErrorRe
         .generate(GenerateRequest {
             inputs: "liveness".to_string(),
             parameters: GenerateParameters {
-                temperature: 1.0,
-                repetition_penalty: 1.0,
-                top_k: 0,
-                top_p: 1.0,
+                temperature: None,
+                repetition_penalty: None,
+                top_k: None,
+                top_p: None,
                 do_sample: false,
                 max_new_tokens: 1,
-                stop: vec![],
+                stop: Vec::new(),
                 details: false,
                 seed: None,
             },
@@ -47,7 +49,24 @@ async fn health(infer: Extension<Infer>) -> Result<(), (StatusCode, Json<ErrorRe
     Ok(())
 }
 
-/// Generate method
+/// Generate tokens
+#[utoipa::path(
+    post,
+    tag = "Text Generation Inference",
+    path = "/generate",
+    request_body = GenerateRequest,
+    responses(
+        (status = 200, description = "Generated Text", body = [GenerateResponse]),
+        (status = 424, description = "Generation Error", body = [ErrorResponse],
+            example = json!({"error": "Request failed during generation"})),
+        (status = 429, description = "Model is overloaded", body = [ErrorResponse],
+            example = json!({"error": "Model is overloaded"})),
+        (status = 422, description = "Input validation error", body = [ErrorResponse],
+            example = json!({"error": "Input validation error"})),
+        (status = 500, description = "Incomplete generation", body = [ErrorResponse],
+            example = json!({"error": "Incomplete generation"})),
+    )
+)]
 #[instrument(
     skip(infer),
     fields(
@@ -76,7 +95,7 @@ async fn generate(
     // Token details
     let details = match details {
         true => Some(Details {
-            finish_reason: response.generated_text.finish_reason,
+            finish_reason: FinishReason::from(response.generated_text.finish_reason),
             generated_tokens: response.generated_text.generated_tokens,
             prefill: Some(response.prefill),
             tokens: Some(response.tokens),
@@ -132,7 +151,29 @@ async fn generate(
     Ok((headers, Json(response)))
 }
 
-/// Generate stream method
+/// Generate a stream of token using Server Side Events
+#[utoipa::path(
+    post,
+    tag = "Text Generation Inference",
+    path = "/generate_stream",
+    request_body = GenerateRequest,
+    responses(
+        (status = 200, description = "Generated Text", body = [StreamResponse],
+            content_type="text/event-stream "),
+        (status = 424, description = "Generation Error", body = [ErrorResponse],
+            example = json!({"error": "Request failed during generation"}),
+            content_type="text/event-stream "),
+        (status = 429, description = "Model is overloaded", body = [ErrorResponse],
+            example = json!({"error": "Model is overloaded"}),
+            content_type="text/event-stream "),
+        (status = 422, description = "Input validation error", body = [ErrorResponse],
+            example = json!({"error": "Input validation error"}),
+            content_type="text/event-stream "),
+        (status = 500, description = "Incomplete generation", body = [ErrorResponse],
+            example = json!({"error": "Incomplete generation"}),
+            content_type="text/event-stream "),
+    )
+)]
 #[instrument(
     skip(infer),
     fields(
@@ -185,11 +226,9 @@ async fn generate_stream(
                         } => {
                             // Token details
                             let details = match details {
-                                true => Some(Details {
-                                    finish_reason: generated_text.finish_reason,
+                                true => Some(StreamDetails {
+                                    finish_reason: FinishReason::from(generated_text.finish_reason),
                                     generated_tokens: generated_text.generated_tokens,
-                                    prefill: None,
-                                    tokens: None,
                                     seed: generated_text.seed,
                                 }),
                                 false => None,
@@ -265,6 +304,39 @@ pub async fn run(
     validation_workers: usize,
     addr: SocketAddr,
 ) {
+    // OpenAPI documentation
+    #[derive(OpenApi)]
+    #[openapi(
+        paths(
+            generate,
+            generate_stream,
+        ),
+        components(
+            schemas(
+                GenerateRequest,
+                GenerateParameters,
+                Token,
+                GenerateResponse,
+                Details,
+                FinishReason,
+                StreamResponse,
+                StreamDetails,
+                ErrorResponse,
+            )
+        ),
+        tags(
+            (name = "Text Generation Inference", description = "Hugging Face Text Generation Inference API")
+        ),
+        info(
+            title = "Text Generation Inference",
+            license(
+                name = "Apache 2.0",
+                url = "https://www.apache.org/licenses/LICENSE-2.0"
+            )
+        )
+    )]
+    struct ApiDoc;
+
     // Create state
     let validation = Validation::new(validation_workers, tokenizer, max_input_length);
     let infer = Infer::new(
@@ -277,6 +349,7 @@ pub async fn run(
 
     // Create router
     let app = Router::new()
+        .merge(SwaggerUi::new("/docs").url("/api-doc/openapi.json", ApiDoc::openapi()))
         .route("/", post(generate))
         .route("/generate", post(generate))
         .route("/generate_stream", post(generate_stream))
@@ -320,6 +393,17 @@ async fn shutdown_signal() {
     tracing::info!("signal received, starting graceful shutdown");
 }
 
+impl From<i32> for FinishReason {
+    fn from(finish_reason: i32) -> Self {
+        let finish_reason = text_generation_client::FinishReason::from_i32(finish_reason).unwrap();
+        match finish_reason {
+            text_generation_client::FinishReason::Length => FinishReason::Length,
+            text_generation_client::FinishReason::EosToken => FinishReason::EndOfSequenceToken,
+            text_generation_client::FinishReason::StopSequence => FinishReason::StopSequence,
+        }
+    }
+}
+
 /// Convert to Axum supported formats
 impl From<InferError> for (StatusCode, Json<ErrorResponse>) {
     fn from(err: InferError) -> Self {
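After this change the router also serves its own documentation: the merged SwaggerUi exposes an interactive UI at /docs and the raw OpenAPI document at /api-doc/openapi.json, and /generate_stream answers with Server-Sent Events whose data payloads are StreamResponse objects. A hedged sketch follows; the host and port are assumptions, and the exact shape of the spec beyond the fields declared above is not guaranteed.

# Hedged sketch: fetching the generated OpenAPI spec and consuming the SSE stream.
import json
import requests

base = "http://localhost:3000"  # assumed address of the router

spec = requests.get(f"{base}/api-doc/openapi.json", timeout=10).json()
print(spec["info"]["title"])         # expected: "Text Generation Inference"
print(sorted(spec["paths"].keys()))  # expected to include /generate and /generate_stream

# Each SSE event's `data:` line carries a StreamResponse JSON object
# (token, generated_text, details).
with requests.post(
    f"{base}/generate_stream",
    json={"inputs": "My name is Olivier and I", "parameters": {"max_new_tokens": 20}},
    stream=True,
    timeout=60,
) as resp:
    for line in resp.iter_lines():
        if line.startswith(b"data:"):
            event = json.loads(line[len(b"data:"):])
            print(event["token"]["text"], end="", flush=True)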
@@ -110,30 +110,58 @@ fn validate(
     max_input_length: usize,
     rng: &mut ThreadRng,
 ) -> Result<ValidGenerateRequest, ValidationError> {
-    if request.parameters.temperature <= 0.0 {
+    let GenerateParameters {
+        temperature,
+        repetition_penalty,
+        top_k,
+        top_p,
+        do_sample,
+        max_new_tokens,
+        stop: stop_sequences,
+        seed,
+        ..
+    } = request.parameters;
+
+    let temperature = temperature.unwrap_or(1.0);
+    if temperature <= 0.0 {
         return Err(ValidationError::Temperature);
     }
-    if request.parameters.repetition_penalty <= 0.0 {
+
+    let repetition_penalty = repetition_penalty.unwrap_or(1.0);
+    if repetition_penalty <= 0.0 {
         return Err(ValidationError::RepetitionPenalty);
     }
-    if request.parameters.top_p <= 0.0 || request.parameters.top_p > 1.0 {
+
+    let top_p = top_p.unwrap_or(1.0);
+    if top_p <= 0.0 || top_p > 1.0 {
         return Err(ValidationError::TopP);
     }
-    if request.parameters.top_k < 0 {
-        return Err(ValidationError::TopK);
-    }
-    if request.parameters.max_new_tokens > MAX_MAX_NEW_TOKENS {
+
+    // Different because the proto default value is 0 while it is not a valid value
+    // for the user
+    let top_k: u32 = match top_k {
+        None => Ok(0),
+        Some(top_k) => {
+            if top_k <= 0 {
+                return Err(ValidationError::TopK);
+            }
+            Ok(top_k as u32)
+        }
+    }?;
+
+    if max_new_tokens == 0 || max_new_tokens > MAX_MAX_NEW_TOKENS {
         return Err(ValidationError::MaxNewTokens(MAX_MAX_NEW_TOKENS));
     }
-    if request.parameters.stop.len() > MAX_STOP_SEQUENCES {
+
+    if stop_sequences.len() > MAX_STOP_SEQUENCES {
         return Err(ValidationError::StopSequence(
             MAX_STOP_SEQUENCES,
-            request.parameters.stop.len(),
+            stop_sequences.len(),
         ));
     }
 
     // If seed is None, assign a random one
-    let seed = match request.parameters.seed {
+    let seed = match seed {
         None => rng.gen(),
         Some(seed) => seed,
     };
@@ -147,21 +175,10 @@ fn validate(
         Err(ValidationError::InputLength(input_length, max_input_length))
     } else {
         // Return ValidGenerateRequest
-        let GenerateParameters {
-            temperature,
-            repetition_penalty,
-            top_k,
-            top_p,
-            do_sample,
-            max_new_tokens,
-            stop: stop_sequences,
-            ..
-        } = request.parameters;
-
         let parameters = NextTokenChooserParameters {
             temperature,
             repetition_penalty,
-            top_k: top_k as u32,
+            top_k,
             top_p,
             do_sample,
             seed,
@@ -206,7 +223,7 @@ pub enum ValidationError {
     TopP,
     #[error("top_k must be strictly positive")]
     TopK,
-    #[error("max_new_tokens must be <= {0}")]
+    #[error("max_new_tokens must be strictly positive and <= {0}")]
     MaxNewTokens(u32),
     #[error("inputs must have less than {1} tokens. Given: {0}")]
     InputLength(usize, usize),
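Requests that fail the checks above never reach the model; per the endpoint annotations the router answers 422 with an ErrorResponse body. A hedged sketch of the expected behaviour follows (host and port assumed, and the exact wording of the error string is not quoted here).

# Hedged sketch: a request rejected by validation.
import requests

bad = {
    "inputs": "test",
    # temperature must be strictly positive, and max_new_tokens may neither be 0
    # nor exceed the configured maximum, so this payload should be rejected.
    "parameters": {"temperature": 0.0, "max_new_tokens": 0},
}

resp = requests.post("http://localhost:3000/generate", json=bad, timeout=30)
print(resp.status_code)      # expected: 422
print(resp.json()["error"])  # human-readable validation message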
@@ -1,6 +1,6 @@
-# BLOOM Inference Python gRPC Server
+# Text Generation Inference Python gRPC Server
 
-A Python gRPC server for BLOOM Inference
+A Python gRPC server for Text Generation Inference
 
 ## Install
 
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "text-generation"
-version = "0.1.0"
-description = "BLOOM Inference Python gRPC Server"
+version = "0.2.0"
+description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <olivier@huggingface.co>"]
 
 [tool.poetry.scripts]
@@ -140,8 +140,7 @@ def test_causal_lm_generate_token_completion(default_bloom, default_bloom_batch)
 
     assert len(generations) == 1
     assert (
-        generations[0].generated_text.text
-        == "TestTestTestTestTestTestTestTestTestTest"
+        generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
     )
     assert generations[0].request_id == default_bloom_batch.requests[0].id
     assert (
@@ -187,8 +186,7 @@ def test_causal_lm_generate_token_completion_multi(
 
     assert len(generations) == 1
     assert (
-        generations[0].generated_text.text
-        == "TestTestTestTestTestTestTestTestTestTest"
+        generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
     )
     assert (
         generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id
@@ -283,8 +281,7 @@ def test_batch_concatenate(
 
     assert len(generations) == 2
     assert (
-        generations[0].generated_text.text
-        == "TestTestTestTestTestTestTestTestTestTest"
+        generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
    )
     assert generations[0].request_id == default_bloom_batch.requests[0].id
     assert (
@@ -306,8 +303,7 @@ def test_batch_concatenate(
 
     assert len(generations) == 1
     assert (
-        generations[0].generated_text.text
-        == "TestTestTestTestTestTestTestTestTestTest"
+        generations[0].generated_text.text == "TestTestTestTestTestTestTestTestTestTest"
     )
     assert (
         generations[0].request_id == default_multi_requests_bloom_batch.requests[0].id
@@ -9,6 +9,7 @@ from text_generation.utils import (
     StopSequenceCriteria,
     StoppingCriteria,
     LocalEntryNotFoundError,
+    FinishReason,
 )
 
 
@@ -24,13 +25,13 @@ def test_stop_sequence_criteria():
 def test_stopping_criteria():
     criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5)
     assert criteria(65827, "/test") == (False, None)
-    assert criteria(30, ";") == (True, "stop_sequence")
+    assert criteria(30, ";") == (True, FinishReason.FINISH_REASON_STOP_SEQUENCE)
 
 
 def test_stopping_criteria_eos():
     criteria = StoppingCriteria(0, [StopSequenceCriteria("/test;")], max_new_tokens=5)
     assert criteria(1, "") == (False, None)
-    assert criteria(0, "") == (True, "eos_token")
+    assert criteria(0, "") == (True, FinishReason.FINISH_REASON_EOS_TOKEN)
 
 
 def test_stopping_criteria_max():
@@ -39,7 +40,7 @@ def test_stopping_criteria_max():
     assert criteria(1, "") == (False, None)
     assert criteria(1, "") == (False, None)
     assert criteria(1, "") == (False, None)
-    assert criteria(1, "") == (True, "length")
+    assert criteria(1, "") == (True, FinishReason.FINISH_REASON_LENGTH)
 
 
 def test_weight_hub_files():
@@ -13,7 +13,7 @@ app = typer.Typer()
 
 @app.command()
 def serve(
-    model_name: str,
+    model_id: str,
     revision: Optional[str] = None,
     sharded: bool = False,
     quantize: bool = False,
@@ -46,16 +46,16 @@ def serve(
         os.getenv("MASTER_PORT", None) is not None
     ), "MASTER_PORT must be set when sharded is True"
 
-    server.serve(model_name, revision, sharded, quantize, uds_path)
+    server.serve(model_id, revision, sharded, quantize, uds_path)
 
 
 @app.command()
 def download_weights(
-    model_name: str,
+    model_id: str,
     revision: Optional[str] = None,
     extension: str = ".safetensors",
 ):
-    utils.download_weights(model_name, revision, extension)
+    utils.download_weights(model_id, revision, extension)
 
 
 if __name__ == "__main__":
@@ -30,31 +30,31 @@ torch.backends.cudnn.allow_tf32 = True
 
 
 def get_model(
-    model_name: str, revision: Optional[str], sharded: bool, quantize: bool
+    model_id: str, revision: Optional[str], sharded: bool, quantize: bool
 ) -> Model:
-    config = AutoConfig.from_pretrained(model_name, revision=revision)
+    config = AutoConfig.from_pretrained(model_id, revision=revision)
 
     if config.model_type == "bloom":
         if sharded:
-            return BLOOMSharded(model_name, revision, quantize=quantize)
+            return BLOOMSharded(model_id, revision, quantize=quantize)
         else:
-            return BLOOM(model_name, revision, quantize=quantize)
+            return BLOOM(model_id, revision, quantize=quantize)
     elif config.model_type == "gpt_neox":
         if sharded:
-            return GPTNeoxSharded(model_name, revision, quantize=quantize)
+            return GPTNeoxSharded(model_id, revision, quantize=quantize)
         else:
-            return GPTNeox(model_name, revision, quantize=quantize)
-    elif model_name.startswith("facebook/galactica"):
+            return GPTNeox(model_id, revision, quantize=quantize)
+    elif model_id.startswith("facebook/galactica"):
         if sharded:
-            return GalacticaSharded(model_name, revision, quantize=quantize)
+            return GalacticaSharded(model_id, revision, quantize=quantize)
         else:
-            return Galactica(model_name, revision, quantize=quantize)
-    elif "santacoder" in model_name:
-        return SantaCoder(model_name, revision, quantize)
+            return Galactica(model_id, revision, quantize=quantize)
+    elif "santacoder" in model_id:
+        return SantaCoder(model_id, revision, quantize)
     else:
         if sharded:
             raise ValueError("sharded is not supported for AutoModel")
         try:
-            return CausalLM(model_name, revision, quantize=quantize)
+            return CausalLM(model_id, revision, quantize=quantize)
         except Exception:
-            return Seq2SeqLM(model_name, revision, quantize=quantize)
+            return Seq2SeqLM(model_id, revision, quantize=quantize)
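After the rename, the loader dispatches on a Hugging Face model id rather than a generic "model name". A hedged usage sketch follows; the import path is an assumption based on the usual package layout and is not shown in this diff.

# Hedged sketch: loading a model through the renamed entry point.
from text_generation.models import get_model  # assumed module path

model = get_model(
    "bigscience/bloom-560m",  # model_id: any supported hub id, routed by config.model_type
    None,                     # revision
    False,                    # sharded
    False,                    # quantize
)
print(type(model).__name__)  # expected to be a BLOOM variant for this id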
@@ -57,10 +57,10 @@ class BLOOM(CausalLM):
 
 class BLOOMSharded(BLOOM):
     def __init__(
-        self, model_name: str, revision: Optional[str] = None, quantize: bool = False
+        self, model_id: str, revision: Optional[str] = None, quantize: bool = False
     ):
-        if not model_name.startswith("bigscience/bloom"):
-            raise ValueError(f"Model {model_name} is not supported")
+        if not model_id.startswith("bigscience/bloom"):
+            raise ValueError(f"Model {model_id} is not supported")
 
         self.process_group, self.rank, self.world_size = initialize_torch_distributed()
         self.master = self.rank == 0
@@ -72,22 +72,20 @@ class BLOOMSharded(BLOOM):
             dtype = torch.float32
 
         tokenizer = AutoTokenizer.from_pretrained(
-            model_name, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left"
         )
 
         config = AutoConfig.from_pretrained(
-            model_name, revision=revision, slow_but_exact=False, tp_parallel=True
+            model_id, revision=revision, slow_but_exact=False, tp_parallel=True
         )
         config.pad_token_id = 3
 
         # Only download weights for small models
-        if self.master and model_name == "bigscience/bloom-560m":
-            download_weights(model_name, revision=revision, extension=".safetensors")
+        if self.master and model_id == "bigscience/bloom-560m":
+            download_weights(model_id, revision=revision, extension=".safetensors")
 
         torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(
-            model_name, revision=revision, extension=".safetensors"
-        )
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         if not filenames:
             raise ValueError("No safetensors weights found")
 
@@ -232,7 +232,7 @@ class CausalLMBatch(Batch):
 
 
 class CausalLM(Model):
-    def __init__(self, model_name: str, revision: Optional[str] = None, quantize=False):
+    def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False):
         if torch.cuda.is_available():
             device = torch.device("cuda")
             dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
@@ -244,10 +244,10 @@ class CausalLM(Model):
             dtype = torch.float32
 
         tokenizer = AutoTokenizer.from_pretrained(
-            model_name, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left"
         )
         self.model = AutoModelForCausalLM.from_pretrained(
-            model_name,
+            model_id,
             revision=revision,
             torch_dtype=dtype,
             device_map="auto" if torch.cuda.is_available() else None,
@@ -149,10 +149,10 @@ class Galactica(CausalLM):
 
 class GalacticaSharded(Galactica):
     def __init__(
-        self, model_name: str, revision: Optional[str] = None, quantize: bool = False
+        self, model_id: str, revision: Optional[str] = None, quantize: bool = False
     ):
-        if not model_name.startswith("facebook/galactica"):
-            raise ValueError(f"Model {model_name} is not supported")
+        if not model_id.startswith("facebook/galactica"):
+            raise ValueError(f"Model {model_id} is not supported")
 
         self.process_group, self.rank, self.world_size = initialize_torch_distributed()
         self.master = self.rank == 0
@@ -164,22 +164,20 @@ class GalacticaSharded(Galactica):
             dtype = torch.float32
 
         tokenizer = AutoTokenizer.from_pretrained(
-            model_name, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left"
         )
 
         config = AutoConfig.from_pretrained(
-            model_name, revision=revision, tp_parallel=True
+            model_id, revision=revision, tp_parallel=True
         )
         tokenizer.pad_token_id = config.pad_token_id
 
         # Only download weights for small models
-        if self.master and model_name == "facebook/galactica-125m":
-            download_weights(model_name, revision=revision, extension=".safetensors")
+        if self.master and model_id == "facebook/galactica-125m":
+            download_weights(model_id, revision=revision, extension=".safetensors")
 
         torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(
-            model_name, revision=revision, extension=".safetensors"
-        )
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         if not filenames:
             raise ValueError("No safetensors weights found")
 
@@ -49,7 +49,7 @@ class GPTNeox(CausalLM):
 
 class GPTNeoxSharded(GPTNeox):
     def __init__(
-        self, model_name: str, revision: Optional[str] = None, quantize: bool = False
+        self, model_id: str, revision: Optional[str] = None, quantize: bool = False
     ):
         self.process_group, self.rank, self.world_size = initialize_torch_distributed()
         self.master = self.rank == 0
@@ -61,22 +61,20 @@ class GPTNeoxSharded(GPTNeox):
             dtype = torch.float32
 
         tokenizer = AutoTokenizer.from_pretrained(
-            model_name, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left"
         )
         tokenizer.pad_token = tokenizer.eos_token
 
         config = AutoConfig.from_pretrained(
-            model_name, revision=revision, tp_parallel=True
+            model_id, revision=revision, tp_parallel=True
         )
 
         # Only master download weights
         if self.master:
-            download_weights(model_name, revision=revision, extension=".safetensors")
+            download_weights(model_id, revision=revision, extension=".safetensors")
 
         torch.distributed.barrier(group=self.process_group)
-        filenames = weight_files(
-            model_name, revision=revision, extension=".safetensors"
-        )
+        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
         if not filenames:
             raise ValueError("No safetensors weights found")
 
@@ -14,7 +14,7 @@ EOD = "<|endoftext|>"
 
 
 class SantaCoder(CausalLM):
-    def __init__(self, model_name: str, revision: Optional[str] = None, quantize=False):
+    def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False):
         if torch.cuda.is_available():
             device = torch.device("cuda")
             dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
@@ -26,7 +26,7 @@ class SantaCoder(CausalLM):
             dtype = torch.float32
 
         tokenizer = AutoTokenizer.from_pretrained(
-            model_name, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left"
        )
         tokenizer.add_special_tokens(
             {
@@ -43,7 +43,7 @@ class SantaCoder(CausalLM):
 
         self.model = (
             AutoModelForCausalLM.from_pretrained(
-                model_name,
+                model_id,
                 revision=revision,
                 torch_dtype=dtype,
                 load_in_8bit=quantize,
@@ -289,7 +289,7 @@ class Seq2SeqLMBatch(Batch):
 
 
 class Seq2SeqLM(Model):
-    def __init__(self, model_name: str, revision: Optional[str] = None, quantize=False):
+    def __init__(self, model_id: str, revision: Optional[str] = None, quantize=False):
         if torch.cuda.is_available():
             device = torch.device("cuda")
             dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float32
@@ -301,14 +301,14 @@ class Seq2SeqLM(Model):
             dtype = torch.float32
 
         self.model = AutoModelForSeq2SeqLM.from_pretrained(
-            model_name,
+            model_id,
             revision=revision,
             torch_dtype=dtype,
             device_map="auto" if torch.cuda.is_available() else None,
             load_in_8bit=quantize,
         ).eval()
         tokenizer = AutoTokenizer.from_pretrained(
-            model_name, revision=revision, padding_side="left"
+            model_id, revision=revision, padding_side="left"
         )
         tokenizer.bos_token_id = self.model.config.decoder_start_token_id
 
@@ -7,6 +7,7 @@ from typing import List, Optional
 from transformers import PreTrainedTokenizerBase
 
 from text_generation.pb import generate_pb2
+from text_generation.pb.generate_pb2 import FinishReason
 
 
 class Batch(ABC):
@@ -38,7 +39,7 @@ class Batch(ABC):
 class GeneratedText:
     text: str
     generated_tokens: int
-    finish_reason: str
+    finish_reason: FinishReason
     seed: Optional[int]
 
     def to_pb(self) -> generate_pb2.GeneratedText:
@@ -66,14 +66,14 @@ class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer):
 
 
 def serve(
-    model_name: str,
+    model_id: str,
     revision: Optional[str],
     sharded: bool,
     quantize: bool,
     uds_path: Path,
 ):
     async def serve_inner(
-        model_name: str,
+        model_id: str,
         revision: Optional[str],
         sharded: bool = False,
         quantize: bool = False,
@@ -89,7 +89,7 @@ def serve(
         local_url = unix_socket_template.format(uds_path, 0)
         server_urls = [local_url]
 
-        model = get_model(model_name, revision, sharded, quantize)
+        model = get_model(model_id, revision, sharded, quantize)
 
         server = aio.server(interceptors=[ExceptionInterceptor()])
         generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
@@ -109,4 +109,4 @@ def serve(
         logger.info("Signal received. Shutting down")
         await server.stop(0)
 
-    asyncio.run(serve_inner(model_name, revision, sharded, quantize))
+    asyncio.run(serve_inner(model_id, revision, sharded, quantize))
@@ -24,9 +24,11 @@ from transformers.generation.logits_process import (
 )
 
 from text_generation.pb import generate_pb2
+from text_generation.pb.generate_pb2 import FinishReason
 
 WEIGHTS_CACHE_OVERRIDE = os.getenv("WEIGHTS_CACHE_OVERRIDE", None)
 
 
 class Sampling:
     def __init__(self, seed: int, device: str = "cpu"):
         self.generator = torch.Generator(device)
@@ -129,15 +131,15 @@ class StoppingCriteria:
     def __call__(self, last_token: int, last_output: str) -> Tuple[bool, Optional[str]]:
         self.current_tokens += 1
         if self.current_tokens >= self.max_new_tokens:
-            return True, "length"
+            return True, FinishReason.FINISH_REASON_LENGTH
 
         if last_token == self.eos_token_id:
-            return True, "eos_token"
+            return True, FinishReason.FINISH_REASON_EOS_TOKEN
 
         self.current_output += last_output
         for stop_sequence_criteria in self.stop_sequence_criterias:
             if stop_sequence_criteria(self.current_output):
-                return True, "stop_sequence"
+                return True, FinishReason.FINISH_REASON_STOP_SEQUENCE
 
         return False, None
 
@@ -180,20 +182,20 @@ def initialize_torch_distributed():
     return torch.distributed.distributed_c10d._get_default_group(), rank, world_size
 
 
-def weight_hub_files(model_name, revision=None, extension=".safetensors"):
+def weight_hub_files(model_id, revision=None, extension=".safetensors"):
     """Get the safetensors filenames on the hub"""
     api = HfApi()
-    info = api.model_info(model_name, revision=revision)
+    info = api.model_info(model_id, revision=revision)
     filenames = [s.rfilename for s in info.siblings if s.rfilename.endswith(extension)]
     return filenames
 
 
-def try_to_load_from_cache(model_name, revision, filename):
+def try_to_load_from_cache(model_id, revision, filename):
     """Try to load a file from the Hugging Face cache"""
     if revision is None:
         revision = "main"
 
-    object_id = model_name.replace("/", "--")
+    object_id = model_id.replace("/", "--")
     repo_cache = Path(HUGGINGFACE_HUB_CACHE) / f"models--{object_id}"
 
     if not repo_cache.is_dir():
@@ -228,38 +230,38 @@ def try_to_load_from_cache(model_name, revision, filename):
     return str(cached_file) if cached_file.is_file() else None
 
 
-def weight_files(model_name, revision=None, extension=".safetensors"):
+def weight_files(model_id, revision=None, extension=".safetensors"):
     """Get the local safetensors filenames"""
     if WEIGHTS_CACHE_OVERRIDE is not None:
         return list(Path(WEIGHTS_CACHE_OVERRIDE).glob(f"*{extension}"))
 
-    filenames = weight_hub_files(model_name, revision, extension)
+    filenames = weight_hub_files(model_id, revision, extension)
     files = []
     for filename in filenames:
         cache_file = try_to_load_from_cache(
-            model_name, revision=revision, filename=filename
+            model_id, revision=revision, filename=filename
         )
         if cache_file is None:
             raise LocalEntryNotFoundError(
-                f"File {filename} of model {model_name} not found in "
+                f"File {filename} of model {model_id} not found in "
                 f"{os.getenv('HUGGINGFACE_HUB_CACHE', 'the local cache')}. "
-                f"Please run `text-generation-server download-weights {model_name}` first."
+                f"Please run `text-generation-server download-weights {model_id}` first."
             )
         files.append(cache_file)
 
     return files
 
 
-def download_weights(model_name, revision=None, extension=".safetensors"):
+def download_weights(model_id, revision=None, extension=".safetensors"):
     """Download the safetensors files from the hub"""
     if WEIGHTS_CACHE_OVERRIDE is not None:
         return list(Path(WEIGHTS_CACHE_OVERRIDE).glob(f"*{extension}"))
 
-    filenames = weight_hub_files(model_name, revision, extension)
+    filenames = weight_hub_files(model_id, revision, extension)
 
     download_function = partial(
         hf_hub_download,
-        repo_id=model_name,
+        repo_id=model_id,
         local_files_only=False,
     )
 
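The weight helpers now take the same model_id used elsewhere in the server. A hedged usage sketch follows, mirroring the workflow suggested by the error message above (download the safetensors shards, then resolve the locally cached files); the chosen model id is only an example.

# Hedged sketch: weight helpers after the model_id rename.
from text_generation.utils import download_weights, weight_files

model_id = "bigscience/bloom-560m"  # example id

# Fetch the .safetensors shards from the Hub into the local HF cache.
download_weights(model_id, revision=None, extension=".safetensors")

# Resolve the cached file paths; raises LocalEntryNotFoundError if a shard
# is missing from the cache.
filenames = weight_files(model_id, revision=None, extension=".safetensors")
print(filenames)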