diff --git a/docs/openapi.json b/docs/openapi.json index df2d427f..4454259b 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -1,883 +1 @@ -{ - "openapi": "3.0.3", - "info": { - "title": "Text Generation Inference", - "description": "Text Generation Webserver", - "contact": { - "name": "Olivier Dehaene" - }, - "license": { - "name": "Apache 2.0", - "url": "https://www.apache.org/licenses/LICENSE-2.0" - }, - "version": "1.3.4" - }, - "paths": { - "/": { - "post": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`", - "description": "Generate tokens if `stream == false` or a stream of token if `stream == true`", - "operationId": "compat_generate", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CompatGenerateRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Generated Text", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GenerateResponse" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/StreamResponse" - } - } - } - }, - "422": { - "description": "Input validation error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Input validation error" - } - } - } - }, - "424": { - "description": "Generation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Request failed during generation" - } - } - } - }, - "429": { - "description": "Model is overloaded", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Model is overloaded" - } - } - } - }, - "500": { - "description": "Incomplete generation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Incomplete generation" - } - } - } - } - } - } - }, - "/generate": { - "post": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Generate tokens", - "description": "Generate tokens", - "operationId": "generate", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GenerateRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Generated Text", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GenerateResponse" - } - } - } - }, - "422": { - "description": "Input validation error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Input validation error" - } - } - } - }, - "424": { - "description": "Generation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Request failed during generation" - } - } - } - }, - "429": { - "description": "Model is overloaded", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Model is overloaded" - } - } - } - }, - "500": { - "description": "Incomplete generation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Incomplete generation" - } - } - } - } - } - } - }, - 
"/generate_stream": { - "post": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Generate a stream of token using Server-Sent Events", - "description": "Generate a stream of token using Server-Sent Events", - "operationId": "generate_stream", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GenerateRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Generated Text", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/StreamResponse" - } - } - } - }, - "422": { - "description": "Input validation error", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Input validation error" - } - } - } - }, - "424": { - "description": "Generation Error", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Request failed during generation" - } - } - } - }, - "429": { - "description": "Model is overloaded", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Model is overloaded" - } - } - } - }, - "500": { - "description": "Incomplete generation", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Incomplete generation" - } - } - } - } - } - } - }, - "/health": { - "get": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Health check method", - "description": "Health check method", - "operationId": "health", - "responses": { - "200": { - "description": "Everything is working fine" - }, - "503": { - "description": "Text generation inference is down", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "unhealthy", - "error_type": "healthcheck" - } - } - } - } - } - } - }, - "/info": { - "get": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Text Generation Inference endpoint info", - "description": "Text Generation Inference endpoint info", - "operationId": "get_model_info", - "responses": { - "200": { - "description": "Served model info", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Info" - } - } - } - } - } - } - }, - "/metrics": { - "get": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Prometheus metrics scrape endpoint", - "description": "Prometheus metrics scrape endpoint", - "operationId": "metrics", - "responses": { - "200": { - "description": "Prometheus Metrics", - "content": { - "text/plain": { - "schema": { - "type": "string" - } - } - } - } - } - } - } - }, - "components": { - "schemas": { - "BestOfSequence": { - "type": "object", - "required": [ - "generated_text", - "finish_reason", - "generated_tokens", - "prefill", - "tokens" - ], - "properties": { - "finish_reason": { - "$ref": "#/components/schemas/FinishReason" - }, - "generated_text": { - "type": "string", - "example": "test" - }, - "generated_tokens": { - "type": "integer", - "format": "int32", - "example": 1, - "minimum": 0 - }, - "prefill": { - "type": "array", - "items": { - "$ref": "#/components/schemas/PrefillToken" - } - }, - "seed": { - "type": "integer", - "format": "int64", - "example": 42, - "nullable": true, - "minimum": 0 - }, - "tokens": { - "type": "array", - "items": { - "$ref": 
"#/components/schemas/Token" - } - }, - "top_tokens": { - "type": "array", - "items": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Token" - } - } - } - } - }, - "CompatGenerateRequest": { - "type": "object", - "required": [ - "inputs" - ], - "properties": { - "inputs": { - "type": "string", - "example": "My name is Olivier and I" - }, - "parameters": { - "$ref": "#/components/schemas/GenerateParameters" - }, - "stream": { - "type": "boolean", - "default": "false" - } - } - }, - "Details": { - "type": "object", - "required": [ - "finish_reason", - "generated_tokens", - "prefill", - "tokens" - ], - "properties": { - "best_of_sequences": { - "type": "array", - "items": { - "$ref": "#/components/schemas/BestOfSequence" - }, - "nullable": true - }, - "finish_reason": { - "$ref": "#/components/schemas/FinishReason" - }, - "generated_tokens": { - "type": "integer", - "format": "int32", - "example": 1, - "minimum": 0 - }, - "prefill": { - "type": "array", - "items": { - "$ref": "#/components/schemas/PrefillToken" - } - }, - "seed": { - "type": "integer", - "format": "int64", - "example": 42, - "nullable": true, - "minimum": 0 - }, - "tokens": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Token" - } - }, - "top_tokens": { - "type": "array", - "items": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Token" - } - } - } - } - }, - "ErrorResponse": { - "type": "object", - "required": [ - "error", - "error_type" - ], - "properties": { - "error": { - "type": "string" - }, - "error_type": { - "type": "string" - } - } - }, - "FinishReason": { - "type": "string", - "enum": [ - "length", - "eos_token", - "stop_sequence" - ] - }, - "GenerateParameters": { - "type": "object", - "properties": { - "best_of": { - "type": "integer", - "default": "null", - "example": 1, - "nullable": true, - "minimum": 0, - "exclusiveMinimum": 0 - }, - "decoder_input_details": { - "type": "boolean", - "default": "true" - }, - "details": { - "type": "boolean", - "default": "true" - }, - "do_sample": { - "type": "boolean", - "default": "false", - "example": true - }, - "max_new_tokens": { - "type": "integer", - "format": "int32", - "default": "20", - "example": "20", - "nullable": true, - "minimum": 0 - }, - "repetition_penalty": { - "type": "number", - "format": "float", - "default": "null", - "example": 1.03, - "nullable": true, - "exclusiveMinimum": 0 - }, - "return_full_text": { - "type": "boolean", - "default": "null", - "example": false, - "nullable": true - }, - "seed": { - "type": "integer", - "format": "int64", - "default": "null", - "example": "null", - "nullable": true, - "minimum": 0, - "exclusiveMinimum": 0 - }, - "stop": { - "type": "array", - "items": { - "type": "string" - }, - "example": [ - "photographer" - ], - "maxItems": 4 - }, - "temperature": { - "type": "number", - "format": "float", - "default": "null", - "example": 0.5, - "nullable": true, - "exclusiveMinimum": 0 - }, - "top_k": { - "type": "integer", - "format": "int32", - "default": "null", - "example": 10, - "nullable": true, - "exclusiveMinimum": 0 - }, - "top_n_tokens": { - "type": "integer", - "format": "int32", - "default": "null", - "example": 5, - "nullable": true, - "minimum": 0, - "exclusiveMinimum": 0 - }, - "top_p": { - "type": "number", - "format": "float", - "default": "null", - "example": 0.95, - "nullable": true, - "maximum": 1, - "exclusiveMinimum": 0 - }, - "truncate": { - "type": "integer", - "default": "null", - "example": "null", - "nullable": true, - "minimum": 0 - }, - 
"typical_p": { - "type": "number", - "format": "float", - "default": "null", - "example": 0.95, - "nullable": true, - "maximum": 1, - "exclusiveMinimum": 0 - }, - "watermark": { - "type": "boolean", - "default": "false", - "example": true - } - } - }, - "GenerateRequest": { - "type": "object", - "required": [ - "inputs" - ], - "properties": { - "inputs": { - "type": "string", - "example": "My name is Olivier and I" - }, - "parameters": { - "$ref": "#/components/schemas/GenerateParameters" - } - } - }, - "GenerateResponse": { - "type": "object", - "required": [ - "generated_text" - ], - "properties": { - "details": { - "allOf": [ - { - "$ref": "#/components/schemas/Details" - } - ], - "nullable": true - }, - "generated_text": { - "type": "string", - "example": "test" - } - } - }, - "Info": { - "type": "object", - "required": [ - "model_id", - "model_dtype", - "model_device_type", - "max_concurrent_requests", - "max_best_of", - "max_stop_sequences", - "max_input_length", - "max_total_tokens", - "waiting_served_ratio", - "max_batch_total_tokens", - "max_waiting_tokens", - "validation_workers", - "version" - ], - "properties": { - "docker_label": { - "type": "string", - "example": "null", - "nullable": true - }, - "max_batch_total_tokens": { - "type": "integer", - "format": "int32", - "example": "32000", - "minimum": 0 - }, - "max_best_of": { - "type": "integer", - "example": "2", - "minimum": 0 - }, - "max_concurrent_requests": { - "type": "integer", - "description": "Router Parameters", - "example": "128", - "minimum": 0 - }, - "max_input_length": { - "type": "integer", - "example": "1024", - "minimum": 0 - }, - "max_stop_sequences": { - "type": "integer", - "example": "4", - "minimum": 0 - }, - "max_total_tokens": { - "type": "integer", - "example": "2048", - "minimum": 0 - }, - "max_waiting_tokens": { - "type": "integer", - "example": "20", - "minimum": 0 - }, - "model_device_type": { - "type": "string", - "example": "cuda" - }, - "model_dtype": { - "type": "string", - "example": "torch.float16" - }, - "model_id": { - "type": "string", - "description": "Model info", - "example": "bigscience/blomm-560m" - }, - "model_pipeline_tag": { - "type": "string", - "example": "text-generation", - "nullable": true - }, - "model_sha": { - "type": "string", - "example": "e985a63cdc139290c5f700ff1929f0b5942cced2", - "nullable": true - }, - "sha": { - "type": "string", - "example": "null", - "nullable": true - }, - "validation_workers": { - "type": "integer", - "example": "2", - "minimum": 0 - }, - "version": { - "type": "string", - "description": "Router Info", - "example": "0.5.0" - }, - "waiting_served_ratio": { - "type": "number", - "format": "float", - "example": "1.2" - } - } - }, - "PrefillToken": { - "type": "object", - "required": [ - "id", - "text", - "logprob" - ], - "properties": { - "id": { - "type": "integer", - "format": "int32", - "example": 0, - "minimum": 0 - }, - "logprob": { - "type": "number", - "format": "float", - "example": -0.34, - "nullable": true - }, - "text": { - "type": "string", - "example": "test" - } - } - }, - "StreamDetails": { - "type": "object", - "required": [ - "finish_reason", - "generated_tokens" - ], - "properties": { - "finish_reason": { - "$ref": "#/components/schemas/FinishReason" - }, - "generated_tokens": { - "type": "integer", - "format": "int32", - "example": 1, - "minimum": 0 - }, - "seed": { - "type": "integer", - "format": "int64", - "example": 42, - "nullable": true, - "minimum": 0 - } - } - }, - "StreamResponse": { - "type": "object", - "required": [ - 
"token" - ], - "properties": { - "details": { - "allOf": [ - { - "$ref": "#/components/schemas/StreamDetails" - } - ], - "default": "null", - "nullable": true - }, - "generated_text": { - "type": "string", - "default": "null", - "example": "test", - "nullable": true - }, - "token": { - "$ref": "#/components/schemas/Token" - }, - "top_tokens": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Token" - } - } - } - }, - "Token": { - "type": "object", - "required": [ - "id", - "text", - "logprob", - "special" - ], - "properties": { - "id": { - "type": "integer", - "format": "int32", - "example": 0, - "minimum": 0 - }, - "logprob": { - "type": "number", - "format": "float", - "example": -0.34, - "nullable": true - }, - "special": { - "type": "boolean", - "example": "false" - }, - "text": { - "type": "string", - "example": "test" - } - } - } - } - }, - "tags": [ - { - "name": "Text Generation Inference", - "description": "Hugging Face Text Generation Inference API" - } - ] -} +{"openapi":"3.0.3","info":{"title":"Text Generation Inference","description":"Text Generation Webserver","contact":{"name":"Olivier Dehaene"},"license":{"name":"Apache 2.0","url":"https://www.apache.org/licenses/LICENSE-2.0"},"version":"1.3.4"},"paths":{"/":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens if `stream == false` or a stream of token if `stream == true`","description":"Generate tokens if `stream == false` or a stream of token if `stream == true`","operationId":"compat_generate","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CompatGenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateResponse"}},"text/event-stream":{"schema":{"$ref":"#/components/schemas/StreamResponse"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/generate":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens","description":"Generate tokens","operationId":"generate","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateResponse"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is 
overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/generate_stream":{"post":{"tags":["Text Generation Inference"],"summary":"Generate a stream of token using Server-Sent Events","description":"Generate a stream of token using Server-Sent Events","operationId":"generate_stream","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/StreamResponse"}}}},"422":{"description":"Input validation error","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/health":{"get":{"tags":["Text Generation Inference"],"summary":"Health check method","description":"Health check method","operationId":"health","responses":{"200":{"description":"Everything is working fine"},"503":{"description":"Text generation inference is down","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"unhealthy","error_type":"healthcheck"}}}}}}},"/info":{"get":{"tags":["Text Generation Inference"],"summary":"Text Generation Inference endpoint info","description":"Text Generation Inference endpoint info","operationId":"get_model_info","responses":{"200":{"description":"Served model info","content":{"application/json":{"schema":{"$ref":"#/components/schemas/Info"}}}}}}},"/metrics":{"get":{"tags":["Text Generation Inference"],"summary":"Prometheus metrics scrape endpoint","description":"Prometheus metrics scrape endpoint","operationId":"metrics","responses":{"200":{"description":"Prometheus Metrics","content":{"text/plain":{"schema":{"type":"string"}}}}}}},"/tokenize":{"post":{"tags":["Text Generation Inference"],"summary":"Tokenize inputs","description":"Tokenize inputs","operationId":"tokenize","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/TokenizeRequest"}}},"required":true},"responses":{"200":{"description":"Tokenized ids","content":{"application/json":{"schema":{"$ref":"#/components/schemas/TokenizeResponse"}}}},"404":{"description":"No tokenizer found","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"No fast tokenizer available"}}}}}}},"/v1/chat/completions":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens","description":"Generate tokens","operationId":"chat_completions","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ChatRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ChatCompletionChunk"}}}},"422":{"description":"Input validation 
error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}}},"components":{"schemas":{"BestOfSequence":{"type":"object","required":["generated_text","finish_reason","generated_tokens","prefill","tokens"],"properties":{"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_text":{"type":"string","example":"test"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"prefill":{"type":"array","items":{"$ref":"#/components/schemas/PrefillToken"}},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0},"tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}},"top_tokens":{"type":"array","items":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}}},"CompatGenerateRequest":{"type":"object","required":["inputs"],"properties":{"inputs":{"type":"string","example":"My name is Olivier and I"},"parameters":{"$ref":"#/components/schemas/GenerateParameters"},"stream":{"type":"boolean","default":"false"}}},"Details":{"type":"object","required":["finish_reason","generated_tokens","prefill","tokens"],"properties":{"best_of_sequences":{"type":"array","items":{"$ref":"#/components/schemas/BestOfSequence"},"nullable":true},"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"prefill":{"type":"array","items":{"$ref":"#/components/schemas/PrefillToken"}},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0},"tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}},"top_tokens":{"type":"array","items":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}}},"ErrorResponse":{"type":"object","required":["error","error_type"],"properties":{"error":{"type":"string"},"error_type":{"type":"string"}}},"FinishReason":{"type":"string","enum":["length","eos_token","stop_sequence"]},"GenerateParameters":{"type":"object","properties":{"best_of":{"type":"integer","default":"null","example":1,"nullable":true,"minimum":0,"exclusiveMinimum":0},"decoder_input_details":{"type":"boolean","default":"true"},"details":{"type":"boolean","default":"true"},"do_sample":{"type":"boolean","default":"false","example":true},"max_new_tokens":{"type":"integer","format":"int32","default":"100","example":"20","nullable":true,"minimum":0},"repetition_penalty":{"type":"number","format":"float","default":"null","example":1.03,"nullable":true,"exclusiveMinimum":0},"return_full_text":{"type":"boolean","default":"null","example":false,"nullable":true},"seed":{"type":"integer","format":"int64","default":"null","example":"null","nullable":true,"minimum":0,"exclusiveMinimum":0},"stop":{"type":"array","items":{"type":"string"},"example":["photographer"],"maxItems":4},"temperature":{"type":"number","format":"float","default":"null","example":0.5,"nullable":true,"exclusiveMinimum":0},"top_k":{"type":"integer","format
":"int32","default":"null","example":10,"nullable":true,"exclusiveMinimum":0},"top_n_tokens":{"type":"integer","format":"int32","default":"null","example":5,"nullable":true,"minimum":0,"exclusiveMinimum":0},"top_p":{"type":"number","format":"float","default":"null","example":0.95,"nullable":true,"maximum":1,"exclusiveMinimum":0},"truncate":{"type":"integer","default":"null","example":"null","nullable":true,"minimum":0},"typical_p":{"type":"number","format":"float","default":"null","example":0.95,"nullable":true,"maximum":1,"exclusiveMinimum":0},"watermark":{"type":"boolean","default":"false","example":true}}},"GenerateRequest":{"type":"object","required":["inputs"],"properties":{"inputs":{"type":"string","example":"My name is Olivier and I"},"parameters":{"$ref":"#/components/schemas/GenerateParameters"}}},"GenerateResponse":{"type":"object","required":["generated_text"],"properties":{"details":{"allOf":[{"$ref":"#/components/schemas/Details"}],"nullable":true},"generated_text":{"type":"string","example":"test"}}},"Info":{"type":"object","required":["model_id","model_dtype","model_device_type","max_concurrent_requests","max_best_of","max_stop_sequences","max_input_length","max_total_tokens","waiting_served_ratio","max_batch_total_tokens","max_waiting_tokens","validation_workers","version"],"properties":{"docker_label":{"type":"string","example":"null","nullable":true},"max_batch_total_tokens":{"type":"integer","format":"int32","example":"32000","minimum":0},"max_best_of":{"type":"integer","example":"2","minimum":0},"max_concurrent_requests":{"type":"integer","description":"Router Parameters","example":"128","minimum":0},"max_input_length":{"type":"integer","example":"1024","minimum":0},"max_stop_sequences":{"type":"integer","example":"4","minimum":0},"max_total_tokens":{"type":"integer","example":"2048","minimum":0},"max_waiting_tokens":{"type":"integer","example":"20","minimum":0},"model_device_type":{"type":"string","example":"cuda"},"model_dtype":{"type":"string","example":"torch.float16"},"model_id":{"type":"string","description":"Model info","example":"bigscience/blomm-560m"},"model_pipeline_tag":{"type":"string","example":"text-generation","nullable":true},"model_sha":{"type":"string","example":"e985a63cdc139290c5f700ff1929f0b5942cced2","nullable":true},"sha":{"type":"string","example":"null","nullable":true},"validation_workers":{"type":"integer","example":"2","minimum":0},"version":{"type":"string","description":"Router 
Info","example":"0.5.0"},"waiting_served_ratio":{"type":"number","format":"float","example":"1.2"}}},"PrefillToken":{"type":"object","required":["id","text","logprob"],"properties":{"id":{"type":"integer","format":"int32","example":0,"minimum":0},"logprob":{"type":"number","format":"float","example":-0.34,"nullable":true},"text":{"type":"string","example":"test"}}},"StreamDetails":{"type":"object","required":["finish_reason","generated_tokens"],"properties":{"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0}}},"StreamResponse":{"type":"object","required":["index","token"],"properties":{"details":{"allOf":[{"$ref":"#/components/schemas/StreamDetails"}],"default":"null","nullable":true},"generated_text":{"type":"string","default":"null","example":"test","nullable":true},"index":{"type":"integer","format":"int32","minimum":0},"token":{"$ref":"#/components/schemas/Token"},"top_tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}},"Token":{"type":"object","required":["id","text","logprob","special"],"properties":{"id":{"type":"integer","format":"int32","example":0,"minimum":0},"logprob":{"type":"number","format":"float","example":-0.34,"nullable":true},"special":{"type":"boolean","example":"false"},"text":{"type":"string","example":"test"}}}}},"tags":[{"name":"Text Generation Inference","description":"Hugging Face Text Generation Inference API"}]} diff --git a/router/src/infer.rs b/router/src/infer.rs index 8a9875eb..5f078ba0 100644 --- a/router/src/infer.rs +++ b/router/src/infer.rs @@ -165,6 +165,28 @@ impl Infer { )) } + /// Tokenizer the input + #[instrument(skip_all)] + pub(crate) async fn tokenize( + &self, + request: GenerateRequest, + ) -> Result, InferError> { + // Tokenize request + let inputs = request.inputs; + let truncate = request.parameters.truncate; + let encoding = self + .validation + .tokenize(inputs, truncate) + .await + .map_err(|err| { + tracing::error!("Tokenization {err}"); + err + })?; + + // Return Encoding + Ok(encoding.map(|(encoding, _)| encoding)) + } + /// Apply the chat template to the chat request #[instrument(skip_all)] pub(crate) fn apply_chat_template(&self, messages: Vec) -> Result { diff --git a/router/src/lib.rs b/router/src/lib.rs index 894ab466..2bfbbacd 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -444,6 +444,18 @@ pub struct Token { special: bool, } +#[derive(Debug, Serialize, ToSchema)] +pub struct SimpleToken { + #[schema(example = 0)] + id: u32, + #[schema(example = "test")] + text: String, + #[schema(example = 0)] + start: usize, + #[schema(example = 2)] + stop: usize, +} + #[derive(Serialize, ToSchema)] #[serde(rename_all(serialize = "snake_case"))] pub(crate) enum FinishReason { diff --git a/router/src/server.rs b/router/src/server.rs index ff48b4f0..c5ca4665 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -5,8 +5,8 @@ use crate::validation::ValidationError; use crate::{ BestOfSequence, ChatCompletion, ChatCompletionChunk, ChatRequest, CompatGenerateRequest, Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest, GenerateResponse, - HubModelInfo, HubTokenizerConfig, Infer, Info, PrefillToken, StreamDetails, StreamResponse, - Token, Validation, + HubModelInfo, HubTokenizerConfig, Infer, Info, PrefillToken, SimpleToken, StreamDetails, + StreamResponse, Token, Validation, }; use axum::extract::Extension; use 
@@ -532,7 +532,7 @@ async fn generate_stream_internal(
     path = "/v1/chat/completions",
     request_body = ChatRequest,
     responses(
-    (status = 200, description = "Generated Text", body = GenerateResponse),
+    (status = 200, description = "Generated Text", body = ChatCompletionChunk),
     (status = 424, description = "Generation Error", body = ErrorResponse,
     example = json ! ({"error": "Request failed during generation"})),
     (status = 429, description = "Model is overloaded", body = ErrorResponse,
@@ -672,6 +672,52 @@ async fn chat_completions(
     }
 }
 
+/// Tokenize inputs
+#[utoipa::path(
+    post,
+    tag = "Text Generation Inference",
+    path = "/tokenize",
+    request_body = TokenizeRequest,
+    responses(
+    (status = 200, description = "Tokenized ids", body = TokenizeResponse),
+    (status = 404, description = "No tokenizer found", body = ErrorResponse,
+    example = json ! ({"error": "No fast tokenizer available"})),
+    )
+    )]
+#[instrument(skip_all)]
+async fn tokenize(
+    Extension(infer): Extension<Infer>,
+    Json(req): Json<GenerateRequest>,
+) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
+    let input = req.inputs.clone();
+    let encoding = infer.tokenize(req).await?;
+    if let Some(encoding) = encoding {
+        let tokens: Vec<SimpleToken> = encoding
+            .get_ids()
+            .iter()
+            .zip(encoding.get_offsets())
+            .map(|(&id, &(start, stop))| {
+                let text: String = input.chars().skip(start).take(stop - start).collect();
+                SimpleToken {
+                    id,
+                    text,
+                    start,
+                    stop,
+                }
+            })
+            .collect();
+        Ok(Json(tokens).into_response())
+    } else {
+        Err((
+            StatusCode::NOT_FOUND,
+            Json(ErrorResponse {
+                error: "No fast tokenizer or tokenizer.json for this model".to_string(),
+                error_type: "no fast tokenizer".to_string(),
+            }),
+        ))
+    }
+}
+
 /// Prometheus metrics scrape endpoint
 #[utoipa::path(
     get,
@@ -719,6 +765,8 @@ pub async fn run(
     compat_generate,
     generate,
     generate_stream,
+    chat_completions,
+    tokenize,
     metrics,
     ),
     components(
@@ -867,6 +915,7 @@ pub async fn run(
         .route("/generate", post(generate))
         .route("/generate_stream", post(generate_stream))
         .route("/v1/chat/completions", post(chat_completions))
+        .route("/tokenize", post(tokenize))
         .route("/health", get(health))
         .route("/ping", get(health))
         .route("/metrics", get(metrics));
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 370e9588..750b98e5 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -70,12 +70,11 @@ impl Validation {
     }
 
     #[instrument(skip(self, inputs))]
-    async fn validate_input(
+    pub async fn tokenize(
         &self,
         inputs: String,
         truncate: Option<usize>,
-        max_new_tokens: Option<u32>,
-    ) -> Result<(String, usize, u32), ValidationError> {
+    ) -> Result<Option<(tokenizers::Encoding, String)>, ValidationError> {
         // If we have a fast tokenizer
         if let Some(sender) = &self.sender {
             // Create response channel
@@ -88,7 +87,24 @@ impl Validation {
 
             // Await on response channel
             // Unwrap is safe here
-            let (inputs, input_length) = response_receiver.await.unwrap()?;
+            let encoding = response_receiver.await.unwrap()?;
+            Ok(Some(encoding))
+        } else {
+            Ok(None)
+        }
+    }
+
+    #[instrument(skip(self, inputs))]
+    async fn validate_input(
+        &self,
+        inputs: String,
+        truncate: Option<usize>,
+        max_new_tokens: Option<u32>,
+    ) -> Result<(String, usize, u32), ValidationError> {
+        // If we have a fast tokenizer
+        if let Some((encoding, inputs)) = self.tokenize(inputs.clone(), truncate).await? {
+            // Create response channel
+            let input_length = encoding.len();
 
             // Get total tokens
             let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
@@ -343,36 +359,31 @@ fn tokenizer_worker(tokenizer: Tokenizer, mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>) {
 }
 
 /// Get input length and optionally truncate it
 fn prepare_input(
-    inputs: String,
+    mut inputs: String,
     truncate: Option<usize>,
     tokenizer: &Tokenizer,
-) -> Result<(String, usize), ValidationError> {
+) -> Result<(tokenizers::Encoding, String), ValidationError> {
     // Get the number of tokens in the input
     let mut encoding = tokenizer
         .encode(inputs.clone(), true)
         .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
 
     // Optionally truncate
-    let (inputs, input_length) = match truncate {
-        // Truncate is some and < encoding length
-        Some(truncate) if truncate < encoding.len() => {
-            // truncate encoding and decode new inputs
+    if let Some(truncate) = truncate {
+        if truncate < encoding.len() {
             encoding.truncate(truncate, 0, TruncationDirection::Left);
-            let inputs = tokenizer
+            inputs = tokenizer
                 .decode(encoding.get_ids(), false)
                 .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
-            (inputs, encoding.len())
         }
-        // Nothing to do
-        _ => (inputs, encoding.len()),
-    };
+    }
 
-    Ok((inputs, input_length))
+    Ok((encoding, inputs))
 }
 
 type TokenizerRequest = (
     (String, Option<usize>),
-    oneshot::Sender<Result<(String, usize), ValidationError>>,
+    oneshot::Sender<Result<(tokenizers::Encoding, String), ValidationError>>,
     Span,
 );
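
For reference, the new `/tokenize` route accepts the same JSON body as `/generate` (only `inputs` is required) and returns one `SimpleToken` per token, with `start`/`stop` offsets into the original input; when no fast tokenizer is loaded it answers 404. Below is a minimal client sketch, assuming a router listening on `http://localhost:3000` and the `reqwest` (with the `json` feature), `serde`, `serde_json`, and `tokio` crates; none of this is part of the diff itself.

```rust
use serde::Deserialize;

// Mirrors the `SimpleToken` schema added to router/src/lib.rs.
#[derive(Debug, Deserialize)]
struct SimpleToken {
    id: u32,
    text: String,
    start: usize,
    stop: usize,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Same request shape as /generate; only `inputs` is required.
    let body = serde_json::json!({ "inputs": "My name is Olivier and I" });

    let tokens: Vec<SimpleToken> = reqwest::Client::new()
        .post("http://localhost:3000/tokenize")
        .json(&body)
        .send()
        .await?
        // A 404 here means the model has no fast tokenizer / tokenizer.json.
        .error_for_status()?
        .json()
        .await?;

    // `start`/`stop` are the offsets the router uses to slice the original
    // `inputs` (it builds `text` with `chars().skip(start).take(stop - start)`).
    for t in &tokens {
        println!("{:>6}  {:?}  [{}..{}]", t.id, t.text, t.start, t.stop);
    }
    Ok(())
}
```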