From ebecc06161d3399aa1dace7be1a7a86efec85f8d Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Fri, 26 Jan 2024 16:07:31 +0100
Subject: [PATCH] Update the docs to include newer models. (#1492)

---
 docs/openapi.json    |  2 +-
 router/src/lib.rs    | 31 +++++++++++++++++++++++++------
 router/src/server.rs | 22 +++++++++++++++-------
 3 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/docs/openapi.json b/docs/openapi.json
index 4454259b..9a9ed116 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -1 +1 @@
-{"openapi":"3.0.3","info":{"title":"Text Generation Inference","description":"Text Generation Webserver","contact":{"name":"Olivier Dehaene"},"license":{"name":"Apache 2.0","url":"https://www.apache.org/licenses/LICENSE-2.0"},"version":"1.3.4"},"paths":{"/":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens if `stream == false` or a stream of token if `stream == true`","description":"Generate tokens if `stream == false` or a stream of token if `stream == true`","operationId":"compat_generate","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CompatGenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateResponse"}},"text/event-stream":{"schema":{"$ref":"#/components/schemas/StreamResponse"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/generate":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens","description":"Generate tokens","operationId":"generate","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateResponse"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/generate_stream":{"post":{"tags":["Text Generation Inference"],"summary":"Generate a stream of token using Server-Sent Events","description":"Generate a stream of token using Server-Sent Events","operationId":"generate_stream","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/StreamResponse"}}}},"422":{"description":"Input validation error","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/health":{"get":{"tags":["Text Generation Inference"],"summary":"Health check method","description":"Health check method","operationId":"health","responses":{"200":{"description":"Everything is working fine"},"503":{"description":"Text generation inference is down","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"unhealthy","error_type":"healthcheck"}}}}}}},"/info":{"get":{"tags":["Text Generation Inference"],"summary":"Text Generation Inference endpoint info","description":"Text Generation Inference endpoint info","operationId":"get_model_info","responses":{"200":{"description":"Served model info","content":{"application/json":{"schema":{"$ref":"#/components/schemas/Info"}}}}}}},"/metrics":{"get":{"tags":["Text Generation Inference"],"summary":"Prometheus metrics scrape endpoint","description":"Prometheus metrics scrape endpoint","operationId":"metrics","responses":{"200":{"description":"Prometheus Metrics","content":{"text/plain":{"schema":{"type":"string"}}}}}}},"/tokenize":{"post":{"tags":["Text Generation Inference"],"summary":"Tokenize inputs","description":"Tokenize inputs","operationId":"tokenize","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/TokenizeRequest"}}},"required":true},"responses":{"200":{"description":"Tokenized ids","content":{"application/json":{"schema":{"$ref":"#/components/schemas/TokenizeResponse"}}}},"404":{"description":"No tokenizer found","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"No fast tokenizer available"}}}}}}},"/v1/chat/completions":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens","description":"Generate tokens","operationId":"chat_completions","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ChatRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ChatCompletionChunk"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}}},"components":{"schemas":{"BestOfSequence":{"type":"object","required":["generated_text","finish_reason","generated_tokens","prefill","tokens"],"properties":{"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_text":{"type":"string","example":"test"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"prefill":{"type":"array","items":{"$ref":"#/components/schemas/PrefillToken"}},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0},"tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}},"top_tokens":{"type":"array","items":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}}},"CompatGenerateRequest":{"type":"object","required":["inputs"],"properties":{"inputs":{"type":"string","example":"My name is Olivier and I"},"parameters":{"$ref":"#/components/schemas/GenerateParameters"},"stream":{"type":"boolean","default":"false"}}},"Details":{"type":"object","required":["finish_reason","generated_tokens","prefill","tokens"],"properties":{"best_of_sequences":{"type":"array","items":{"$ref":"#/components/schemas/BestOfSequence"},"nullable":true},"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"prefill":{"type":"array","items":{"$ref":"#/components/schemas/PrefillToken"}},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0},"tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}},"top_tokens":{"type":"array","items":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}}},"ErrorResponse":{"type":"object","required":["error","error_type"],"properties":{"error":{"type":"string"},"error_type":{"type":"string"}}},"FinishReason":{"type":"string","enum":["length","eos_token","stop_sequence"]},"GenerateParameters":{"type":"object","properties":{"best_of":{"type":"integer","default":"null","example":1,"nullable":true,"minimum":0,"exclusiveMinimum":0},"decoder_input_details":{"type":"boolean","default":"true"},"details":{"type":"boolean","default":"true"},"do_sample":{"type":"boolean","default":"false","example":true},"max_new_tokens":{"type":"integer","format":"int32","default":"100","example":"20","nullable":true,"minimum":0},"repetition_penalty":{"type":"number","format":"float","default":"null","example":1.03,"nullable":true,"exclusiveMinimum":0},"return_full_text":{"type":"boolean","default":"null","example":false,"nullable":true},"seed":{"type":"integer","format":"int64","default":"null","example":"null","nullable":true,"minimum":0,"exclusiveMinimum":0},"stop":{"type":"array","items":{"type":"string"},"example":["photographer"],"maxItems":4},"temperature":{"type":"number","format":"float","default":"null","example":0.5,"nullable":true,"exclusiveMinimum":0},"top_k":{"type":"integer","format":"int32","default":"null","example":10,"nullable":true,"exclusiveMinimum":0},"top_n_tokens":{"type":"integer","format":"int32","default":"null","example":5,"nullable":true,"minimum":0,"exclusiveMinimum":0},"top_p":{"type":"number","format":"float","default":"null","example":0.95,"nullable":true,"maximum":1,"exclusiveMinimum":0},"truncate":{"type":"integer","default":"null","example":"null","nullable":true,"minimum":0},"typical_p":{"type":"number","format":"float","default":"null","example":0.95,"nullable":true,"maximum":1,"exclusiveMinimum":0},"watermark":{"type":"boolean","default":"false","example":true}}},"GenerateRequest":{"type":"object","required":["inputs"],"properties":{"inputs":{"type":"string","example":"My name is Olivier and I"},"parameters":{"$ref":"#/components/schemas/GenerateParameters"}}},"GenerateResponse":{"type":"object","required":["generated_text"],"properties":{"details":{"allOf":[{"$ref":"#/components/schemas/Details"}],"nullable":true},"generated_text":{"type":"string","example":"test"}}},"Info":{"type":"object","required":["model_id","model_dtype","model_device_type","max_concurrent_requests","max_best_of","max_stop_sequences","max_input_length","max_total_tokens","waiting_served_ratio","max_batch_total_tokens","max_waiting_tokens","validation_workers","version"],"properties":{"docker_label":{"type":"string","example":"null","nullable":true},"max_batch_total_tokens":{"type":"integer","format":"int32","example":"32000","minimum":0},"max_best_of":{"type":"integer","example":"2","minimum":0},"max_concurrent_requests":{"type":"integer","description":"Router Parameters","example":"128","minimum":0},"max_input_length":{"type":"integer","example":"1024","minimum":0},"max_stop_sequences":{"type":"integer","example":"4","minimum":0},"max_total_tokens":{"type":"integer","example":"2048","minimum":0},"max_waiting_tokens":{"type":"integer","example":"20","minimum":0},"model_device_type":{"type":"string","example":"cuda"},"model_dtype":{"type":"string","example":"torch.float16"},"model_id":{"type":"string","description":"Model info","example":"bigscience/blomm-560m"},"model_pipeline_tag":{"type":"string","example":"text-generation","nullable":true},"model_sha":{"type":"string","example":"e985a63cdc139290c5f700ff1929f0b5942cced2","nullable":true},"sha":{"type":"string","example":"null","nullable":true},"validation_workers":{"type":"integer","example":"2","minimum":0},"version":{"type":"string","description":"Router Info","example":"0.5.0"},"waiting_served_ratio":{"type":"number","format":"float","example":"1.2"}}},"PrefillToken":{"type":"object","required":["id","text","logprob"],"properties":{"id":{"type":"integer","format":"int32","example":0,"minimum":0},"logprob":{"type":"number","format":"float","example":-0.34,"nullable":true},"text":{"type":"string","example":"test"}}},"StreamDetails":{"type":"object","required":["finish_reason","generated_tokens"],"properties":{"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0}}},"StreamResponse":{"type":"object","required":["index","token"],"properties":{"details":{"allOf":[{"$ref":"#/components/schemas/StreamDetails"}],"default":"null","nullable":true},"generated_text":{"type":"string","default":"null","example":"test","nullable":true},"index":{"type":"integer","format":"int32","minimum":0},"token":{"$ref":"#/components/schemas/Token"},"top_tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}},"Token":{"type":"object","required":["id","text","logprob","special"],"properties":{"id":{"type":"integer","format":"int32","example":0,"minimum":0},"logprob":{"type":"number","format":"float","example":-0.34,"nullable":true},"special":{"type":"boolean","example":"false"},"text":{"type":"string","example":"test"}}}}},"tags":[{"name":"Text Generation Inference","description":"Hugging Face Text Generation Inference API"}]}
+{"openapi":"3.0.3","info":{"title":"Text Generation Inference","description":"Text Generation Webserver","contact":{"name":"Olivier Dehaene"},"license":{"name":"Apache 2.0","url":"https://www.apache.org/licenses/LICENSE-2.0"},"version":"1.3.4"},"paths":{"/":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens if `stream == false` or a stream of token if `stream == true`","description":"Generate tokens if `stream == false` or a stream of token if `stream == true`","operationId":"compat_generate","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CompatGenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateResponse"}},"text/event-stream":{"schema":{"$ref":"#/components/schemas/StreamResponse"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/generate":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens","description":"Generate tokens","operationId":"generate","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateResponse"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/generate_stream":{"post":{"tags":["Text Generation Inference"],"summary":"Generate a stream of token using Server-Sent Events","description":"Generate a stream of token using Server-Sent Events","operationId":"generate_stream","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/StreamResponse"}}}},"422":{"description":"Input validation error","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/health":{"get":{"tags":["Text Generation Inference"],"summary":"Health check method","description":"Health check method","operationId":"health","responses":{"200":{"description":"Everything is working fine"},"503":{"description":"Text generation inference is down","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"unhealthy","error_type":"healthcheck"}}}}}}},"/info":{"get":{"tags":["Text Generation Inference"],"summary":"Text Generation Inference endpoint info","description":"Text Generation Inference endpoint info","operationId":"get_model_info","responses":{"200":{"description":"Served model info","content":{"application/json":{"schema":{"$ref":"#/components/schemas/Info"}}}}}}},"/metrics":{"get":{"tags":["Text Generation Inference"],"summary":"Prometheus metrics scrape endpoint","description":"Prometheus metrics scrape endpoint","operationId":"metrics","responses":{"200":{"description":"Prometheus Metrics","content":{"text/plain":{"schema":{"type":"string"}}}}}}},"/tokenize":{"post":{"tags":["Text Generation Inference"],"summary":"Tokenize inputs","description":"Tokenize inputs","operationId":"tokenize","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateRequest"}}},"required":true},"responses":{"200":{"description":"Tokenized ids","content":{"application/json":{"schema":{"$ref":"#/components/schemas/TokenizeResponse"}}}},"404":{"description":"No tokenizer found","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"No fast tokenizer available"}}}}}}},"/v1/chat/completions":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens","description":"Generate tokens","operationId":"chat_completions","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ChatRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ChatCompletionChunk"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}}},"components":{"schemas":{"BestOfSequence":{"type":"object","required":["generated_text","finish_reason","generated_tokens","prefill","tokens"],"properties":{"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_text":{"type":"string","example":"test"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"prefill":{"type":"array","items":{"$ref":"#/components/schemas/PrefillToken"}},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0},"tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}},"top_tokens":{"type":"array","items":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}}},"ChatCompletion":{"type":"object","required":["id","object","created","model","system_fingerprint","choices","usage"],"properties":{"choices":{"type":"array","items":{"$ref":"#/components/schemas/ChatCompletionComplete"}},"created":{"type":"integer","format":"int64","example":"1706270835","minimum":0},"id":{"type":"string"},"model":{"type":"string","example":"mistralai/Mistral-7B-Instruct-v0.2"},"object":{"type":"string"},"system_fingerprint":{"type":"string"},"usage":{"$ref":"#/components/schemas/Usage"}}},"ChatCompletionChoice":{"type":"object","required":["index","delta"],"properties":{"delta":{"$ref":"#/components/schemas/ChatCompletionDelta"},"finish_reason":{"type":"string","nullable":true},"index":{"type":"integer","format":"int32","minimum":0},"logprobs":{"type":"number","format":"float","nullable":true}}},"ChatCompletionChunk":{"type":"object","required":["id","object","created","model","system_fingerprint","choices"],"properties":{"choices":{"type":"array","items":{"$ref":"#/components/schemas/ChatCompletionChoice"}},"created":{"type":"integer","format":"int64","example":"1706270978","minimum":0},"id":{"type":"string"},"model":{"type":"string","example":"mistralai/Mistral-7B-Instruct-v0.2"},"object":{"type":"string"},"system_fingerprint":{"type":"string"}}},"ChatCompletionDelta":{"type":"object","required":["role","content"],"properties":{"content":{"type":"string","example":"What is Deep Learning?"},"role":{"type":"string","example":"user"}}},"ChatRequest":{"type":"object","required":["model"],"properties":{"frequency_penalty":{"type":"number","format":"float","description":"Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.","example":"1.0","nullable":true},"logit_bias":{"type":"array","items":{"type":"number","format":"float"},"description":"UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.","nullable":true},"logprobs":{"type":"boolean","description":"Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each\noutput token returned in the content of message.","example":"false","nullable":true},"max_tokens":{"type":"integer","format":"int32","description":"The maximum number of tokens that can be generated in the chat completion.","example":"32","nullable":true,"minimum":0},"messages":{"type":"array","items":{"$ref":"#/components/schemas/Message"},"description":"A list of messages comprising the conversation so far."},"model":{"type":"string","description":"UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.","example":"mistralai/Mistral-7B-Instruct-v0.2"},"n":{"type":"integer","format":"int32","description":"UNUSED\nHow many chat completion choices to generate for each input message. Note that you will be charged based on the\nnumber of generated tokens across all of the choices. Keep n as 1 to minimize costs.","example":"2","nullable":true,"minimum":0},"presence_penalty":{"type":"number","format":"float","description":"UNUSED\nNumber between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics","example":0.1,"nullable":true},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0},"stream":{"type":"boolean"},"temperature":{"type":"number","format":"float","description":"What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.","example":1.0,"nullable":true},"top_logprobs":{"type":"integer","format":"int32","description":"UNUSED\nAn integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.","example":"5","nullable":true,"minimum":0},"top_p":{"type":"number","format":"float","description":"An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.","example":0.95,"nullable":true}}},"CompatGenerateRequest":{"type":"object","required":["inputs"],"properties":{"inputs":{"type":"string","example":"My name is Olivier and I"},"parameters":{"$ref":"#/components/schemas/GenerateParameters"},"stream":{"type":"boolean","default":"false"}}},"Details":{"type":"object","required":["finish_reason","generated_tokens","prefill","tokens"],"properties":{"best_of_sequences":{"type":"array","items":{"$ref":"#/components/schemas/BestOfSequence"},"nullable":true},"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"prefill":{"type":"array","items":{"$ref":"#/components/schemas/PrefillToken"}},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0},"tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}},"top_tokens":{"type":"array","items":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}}},"ErrorResponse":{"type":"object","required":["error","error_type"],"properties":{"error":{"type":"string"},"error_type":{"type":"string"}}},"FinishReason":{"type":"string","enum":["length","eos_token","stop_sequence"],"example":"Length"},"GenerateParameters":{"type":"object","properties":{"best_of":{"type":"integer","default":"null","example":1,"nullable":true,"minimum":0,"exclusiveMinimum":0},"decoder_input_details":{"type":"boolean","default":"true"},"details":{"type":"boolean","default":"true"},"do_sample":{"type":"boolean","default":"false","example":true},"max_new_tokens":{"type":"integer","format":"int32","default":"100","example":"20","nullable":true,"minimum":0},"repetition_penalty":{"type":"number","format":"float","default":"null","example":1.03,"nullable":true,"exclusiveMinimum":0},"return_full_text":{"type":"boolean","default":"null","example":false,"nullable":true},"seed":{"type":"integer","format":"int64","default":"null","example":"null","nullable":true,"minimum":0,"exclusiveMinimum":0},"stop":{"type":"array","items":{"type":"string"},"example":["photographer"],"maxItems":4},"temperature":{"type":"number","format":"float","default":"null","example":0.5,"nullable":true,"exclusiveMinimum":0},"top_k":{"type":"integer","format":"int32","default":"null","example":10,"nullable":true,"exclusiveMinimum":0},"top_n_tokens":{"type":"integer","format":"int32","default":"null","example":5,"nullable":true,"minimum":0,"exclusiveMinimum":0},"top_p":{"type":"number","format":"float","default":"null","example":0.95,"nullable":true,"maximum":1,"exclusiveMinimum":0},"truncate":{"type":"integer","default":"null","example":"null","nullable":true,"minimum":0},"typical_p":{"type":"number","format":"float","default":"null","example":0.95,"nullable":true,"maximum":1,"exclusiveMinimum":0},"watermark":{"type":"boolean","default":"false","example":true}}},"GenerateRequest":{"type":"object","required":["inputs"],"properties":{"inputs":{"type":"string","example":"My name is Olivier and I"},"parameters":{"$ref":"#/components/schemas/GenerateParameters"}}},"GenerateResponse":{"type":"object","required":["generated_text"],"properties":{"details":{"allOf":[{"$ref":"#/components/schemas/Details"}],"nullable":true},"generated_text":{"type":"string","example":"test"}}},"Info":{"type":"object","required":["model_id","model_dtype","model_device_type","max_concurrent_requests","max_best_of","max_stop_sequences","max_input_length","max_total_tokens","waiting_served_ratio","max_batch_total_tokens","max_waiting_tokens","validation_workers","version"],"properties":{"docker_label":{"type":"string","example":"null","nullable":true},"max_batch_total_tokens":{"type":"integer","format":"int32","example":"32000","minimum":0},"max_best_of":{"type":"integer","example":"2","minimum":0},"max_concurrent_requests":{"type":"integer","description":"Router Parameters","example":"128","minimum":0},"max_input_length":{"type":"integer","example":"1024","minimum":0},"max_stop_sequences":{"type":"integer","example":"4","minimum":0},"max_total_tokens":{"type":"integer","example":"2048","minimum":0},"max_waiting_tokens":{"type":"integer","example":"20","minimum":0},"model_device_type":{"type":"string","example":"cuda"},"model_dtype":{"type":"string","example":"torch.float16"},"model_id":{"type":"string","description":"Model info","example":"bigscience/blomm-560m"},"model_pipeline_tag":{"type":"string","example":"text-generation","nullable":true},"model_sha":{"type":"string","example":"e985a63cdc139290c5f700ff1929f0b5942cced2","nullable":true},"sha":{"type":"string","example":"null","nullable":true},"validation_workers":{"type":"integer","example":"2","minimum":0},"version":{"type":"string","description":"Router Info","example":"0.5.0"},"waiting_served_ratio":{"type":"number","format":"float","example":"1.2"}}},"Message":{"type":"object","required":["role","content"],"properties":{"content":{"type":"string","example":"My name is David and I"},"role":{"type":"string","example":"user"}}},"PrefillToken":{"type":"object","required":["id","text","logprob"],"properties":{"id":{"type":"integer","format":"int32","example":0,"minimum":0},"logprob":{"type":"number","format":"float","example":-0.34,"nullable":true},"text":{"type":"string","example":"test"}}},"SimpleToken":{"type":"object","required":["id","text","start","stop"],"properties":{"id":{"type":"integer","format":"int32","example":0,"minimum":0},"start":{"type":"integer","example":0,"minimum":0},"stop":{"type":"integer","example":2,"minimum":0},"text":{"type":"string","example":"test"}}},"StreamDetails":{"type":"object","required":["finish_reason","generated_tokens"],"properties":{"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0}}},"StreamResponse":{"type":"object","required":["index","token"],"properties":{"details":{"allOf":[{"$ref":"#/components/schemas/StreamDetails"}],"default":"null","nullable":true},"generated_text":{"type":"string","default":"null","example":"test","nullable":true},"index":{"type":"integer","format":"int32","minimum":0},"token":{"$ref":"#/components/schemas/Token"},"top_tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}},"Token":{"type":"object","required":["id","text","logprob","special"],"properties":{"id":{"type":"integer","format":"int32","example":0,"minimum":0},"logprob":{"type":"number","format":"float","example":-0.34,"nullable":true},"special":{"type":"boolean","example":"false"},"text":{"type":"string","example":"test"}}},"TokenizeResponse":{"type":"array","items":{"$ref":"#/components/schemas/SimpleToken"}}}},"tags":[{"name":"Text Generation Inference","description":"Hugging Face Text Generation Inference API"}]}
\ No newline at end of file
diff --git a/router/src/lib.rs b/router/src/lib.rs
index 6c16c4b3..fc5670a0 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -188,18 +188,20 @@ fn default_parameters() -> GenerateParameters {
     }
 }
 
-#[derive(Clone, Deserialize, Serialize)]
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
 pub(crate) struct ChatCompletion {
     pub id: String,
     pub object: String,
+    #[schema(example = "1706270835")]
     pub created: u64,
+    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
     pub model: String,
     pub system_fingerprint: String,
     pub choices: Vec<ChatCompletionComplete>,
     pub usage: Usage,
 }
 
-#[derive(Clone, Deserialize, Serialize)]
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionComplete {
     pub index: u32,
     pub message: Message,
@@ -248,17 +250,19 @@ impl ChatCompletion {
     }
 }
 
-#[derive(Clone, Deserialize, Serialize)]
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionChunk {
     pub id: String,
     pub object: String,
+    #[schema(example = "1706270978")]
     pub created: u64,
+    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
     pub model: String,
     pub system_fingerprint: String,
     pub choices: Vec<ChatCompletionChoice>,
 }
 
-#[derive(Clone, Deserialize, Serialize)]
+#[derive(Clone, Deserialize, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionChoice {
     pub index: u32,
     pub delta: ChatCompletionDelta,
@@ -266,9 +270,11 @@ pub(crate) struct ChatCompletionChoice {
     pub finish_reason: Option<String>,
 }
 
-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Serialize, ToSchema)]
 pub(crate) struct ChatCompletionDelta {
+    #[schema(example = "user")]
     pub role: String,
+    #[schema(example = "What is Deep Learning?")]
     pub content: String,
 }
 
@@ -311,7 +317,7 @@ fn default_request_messages() -> Vec<Message> {
 #[derive(Clone, Deserialize, ToSchema, Serialize)]
 pub(crate) struct ChatRequest {
     /// UNUSED
-    #[schema(example = "bigscience/blomm-560m")]
+    #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")]
     /// ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.
     pub model: String, /* NOTE: UNUSED */
 
@@ -322,6 +328,7 @@ pub(crate) struct ChatRequest {
     /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,
     /// decreasing the model's likelihood to repeat the same line verbatim.
     #[serde(default)]
+    #[schema(example = "1.0")]
     pub frequency_penalty: Option<f32>,
 
     /// UNUSED
@@ -336,28 +343,33 @@ pub(crate) struct ChatRequest {
     /// Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each
     /// output token returned in the content of message.
     #[serde(default)]
+    #[schema(example = "false")]
     pub logprobs: Option<bool>,
 
     /// UNUSED
     /// An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with
     /// an associated log probability. logprobs must be set to true if this parameter is used.
     #[serde(default)]
+    #[schema(example = "5")]
     pub top_logprobs: Option<u32>,
 
     /// The maximum number of tokens that can be generated in the chat completion.
     #[serde(default)]
+    #[schema(example = "32")]
     pub max_tokens: Option<u32>,
 
     /// UNUSED
     /// How many chat completion choices to generate for each input message. Note that you will be charged based on the
     /// number of generated tokens across all of the choices. Keep n as 1 to minimize costs.
     #[serde(default)]
+    #[schema(nullable = true, example = "2")]
     pub n: Option<u32>,
 
     /// UNUSED
     /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,
     /// increasing the model's likelihood to talk about new topics
     #[serde(default)]
+    #[schema(nullable = true, example = 0.1)]
     pub presence_penalty: Option<f32>,
 
     #[serde(default = "bool::default")]
@@ -371,11 +383,13 @@ pub(crate) struct ChatRequest {
     ///
     /// We generally recommend altering this or `top_p` but not both.
     #[serde(default)]
+    #[schema(nullable = true, example = 1.0)]
     pub temperature: Option<f32>,
 
     /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the
     /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
     #[serde(default)]
+    #[schema(nullable = true, example = 0.95)]
     pub top_p: Option<f32>,
 }
 
@@ -458,6 +472,7 @@ pub struct SimpleToken {
 
 #[derive(Serialize, ToSchema)]
 #[serde(rename_all(serialize = "snake_case"))]
+#[schema(example = "Length")]
 pub(crate) enum FinishReason {
     #[schema(rename = "length")]
     Length,
@@ -518,6 +533,10 @@ pub(crate) struct GenerateResponse {
     pub details: Option<Details>,
 }
 
+#[derive(Serialize, ToSchema)]
+#[serde(transparent)]
+pub(crate) struct TokenizeResponse(Vec<SimpleToken>);
+
 #[derive(Serialize, ToSchema)]
 pub(crate) struct StreamDetails {
     #[schema(example = "length")]
diff --git a/router/src/server.rs b/router/src/server.rs
index c5ca4665..998d6265 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -3,10 +3,10 @@ use crate::health::Health;
 use crate::infer::{InferError, InferResponse, InferStreamResponse};
 use crate::validation::ValidationError;
 use crate::{
-    BestOfSequence, ChatCompletion, ChatCompletionChunk, ChatRequest, CompatGenerateRequest,
-    Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest, GenerateResponse,
-    HubModelInfo, HubTokenizerConfig, Infer, Info, PrefillToken, SimpleToken, StreamDetails,
-    StreamResponse, Token, Validation,
+    BestOfSequence, ChatCompletion, ChatCompletionChoice, ChatCompletionChunk, ChatCompletionDelta,
+    ChatRequest, CompatGenerateRequest, Details, ErrorResponse, FinishReason, GenerateParameters,
+    GenerateRequest, GenerateResponse, HubModelInfo, HubTokenizerConfig, Infer, Info, Message,
+    PrefillToken, SimpleToken, StreamDetails, StreamResponse, Token, TokenizeResponse, Validation,
 };
 use axum::extract::Extension;
 use axum::http::{HeaderMap, Method, StatusCode};
@@ -677,7 +677,7 @@ async fn chat_completions(
     post,
     tag = "Text Generation Inference",
     path = "/tokenize",
-    request_body = TokenizeRequest,
+    request_body = GenerateRequest,
     responses(
     (status = 200, description = "Tokenized ids", body = TokenizeResponse),
     (status = 404, description = "No tokenizer found", body = ErrorResponse,
@@ -688,7 +688,7 @@ async fn chat_completions(
 async fn tokenize(
     Extension(infer): Extension<Infer>,
     Json(req): Json<GenerateRequest>,
-) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
+) -> Result<Json<TokenizeResponse>, (StatusCode, Json<ErrorResponse>)> {
     let input = req.inputs.clone();
     let encoding = infer.tokenize(req).await?;
     if let Some(encoding) = encoding {
@@ -706,7 +706,7 @@ async fn tokenize(
                 }
             })
             .collect();
-        Ok(Json(tokens).into_response())
+        Ok(Json(TokenizeResponse(tokens)))
     } else {
         Err((
             StatusCode::NOT_FOUND,
@@ -774,10 +774,18 @@ pub async fn run(
     Info,
     CompatGenerateRequest,
     GenerateRequest,
+    ChatRequest,
+    Message,
+    ChatCompletionChoice,
+    ChatCompletionDelta,
+    ChatCompletionChunk,
+    ChatCompletion,
     GenerateParameters,
     PrefillToken,
     Token,
     GenerateResponse,
+    TokenizeResponse,
+    SimpleToken,
     BestOfSequence,
     Details,
     FinishReason,