{ "openapi": "3.0.3", "info": { "title": "Text Generation Inference", "description": "Text Generation Webserver", "contact": { "name": "Olivier Dehaene" }, "license": { "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, "version": "1.4.2" }, "paths": { "/": { "post": { "tags": [ "Text Generation Inference" ], "summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`", "description": "Generate tokens if `stream == false` or a stream of token if `stream == true`", "operationId": "compat_generate", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CompatGenerateRequest" } } }, "required": true }, "responses": { "200": { "description": "Generated Text", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GenerateResponse" } }, "text/event-stream": { "schema": { "$ref": "#/components/schemas/StreamResponse" } } } }, "422": { "description": "Input validation error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Input validation error" } } } }, "424": { "description": "Generation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Request failed during generation" } } } }, "429": { "description": "Model is overloaded", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Model is overloaded" } } } }, "500": { "description": "Incomplete generation", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Incomplete generation" } } } } } } }, "/generate": { "post": { "tags": [ "Text Generation Inference" ], "summary": "Generate tokens", "description": "Generate tokens", "operationId": "generate", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GenerateRequest" } } }, "required": true }, "responses": { "200": { "description": "Generated Text", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GenerateResponse" } } } }, "422": { "description": "Input validation error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Input validation error" } } } }, "424": { "description": "Generation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Request failed during generation" } } } }, "429": { "description": "Model is overloaded", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Model is overloaded" } } } }, "500": { "description": "Incomplete generation", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Incomplete generation" } } } } } } }, "/generate_stream": { "post": { "tags": [ "Text Generation Inference" ], "summary": "Generate a stream of token using Server-Sent Events", "description": "Generate a stream of token using Server-Sent Events", "operationId": "generate_stream", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GenerateRequest" } } }, "required": true }, "responses": { "200": { "description": "Generated Text", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/StreamResponse" } } } }, "422": { "description": "Input validation error", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Input validation error" } } } }, "424": { "description": "Generation Error", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Request failed during generation" } } } }, "429": { "description": "Model is overloaded", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Model is overloaded" } } } }, "500": { "description": "Incomplete generation", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Incomplete generation" } } } } } } }, "/health": { "get": { "tags": [ "Text Generation Inference" ], "summary": "Health check method", "description": "Health check method", "operationId": "health", "responses": { "200": { "description": "Everything is working fine" }, "503": { "description": "Text generation inference is down", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "unhealthy", "error_type": "healthcheck" } } } } } } }, "/info": { "get": { "tags": [ "Text Generation Inference" ], "summary": "Text Generation Inference endpoint info", "description": "Text Generation Inference endpoint info", "operationId": "get_model_info", "responses": { "200": { "description": "Served model info", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Info" } } } } } } }, "/metrics": { "get": { "tags": [ "Text Generation Inference" ], "summary": "Prometheus metrics scrape endpoint", "description": "Prometheus metrics scrape endpoint", "operationId": "metrics", "responses": { "200": { "description": "Prometheus Metrics", "content": { "text/plain": { "schema": { "type": "string" } } } } } } }, "/tokenize": { "post": { "tags": [ "Text Generation Inference" ], "summary": "Tokenize inputs", "description": "Tokenize inputs", "operationId": "tokenize", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GenerateRequest" } } }, "required": true }, "responses": { "200": { "description": "Tokenized ids", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/TokenizeResponse" } } } }, "404": { "description": "No tokenizer found", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "No fast tokenizer available" } } } } } } }, "/v1/chat/completions": { "post": { "tags": [ "Text Generation Inference" ], "summary": "Generate tokens", "description": "Generate tokens", "operationId": "chat_completions", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ChatRequest" } } }, "required": true }, "responses": { "200": { "description": "Generated Text", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ChatCompletionChunk" } } } }, "422": { "description": "Input validation error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Input validation error" } } } }, "424": { "description": "Generation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Request failed during generation" } } } }, "429": { "description": "Model is overloaded", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Model is overloaded" } } } }, "500": { "description": "Incomplete generation", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Incomplete generation" } } } } } } } }, "components": { "schemas": { "BestOfSequence": { "type": "object", "required": [ "generated_text", "finish_reason", "generated_tokens", "prefill", "tokens" ], "properties": { "finish_reason": { "$ref": "#/components/schemas/FinishReason" }, "generated_text": { "type": "string", "example": "test" }, "generated_tokens": { "type": "integer", "format": "int32", "example": 1, "minimum": 0 }, "prefill": { "type": "array", "items": { "$ref": "#/components/schemas/PrefillToken" } }, "seed": { "type": "integer", "format": "int64", "example": 42, "nullable": true, "minimum": 0 }, "tokens": { "type": "array", "items": { "$ref": "#/components/schemas/Token" } }, "top_tokens": { "type": "array", "items": { "type": "array", "items": { "$ref": "#/components/schemas/Token" } } } } }, "ChatCompletion": { "type": "object", "required": [ "id", "object", "created", "model", "system_fingerprint", "choices", "usage" ], "properties": { "choices": { "type": "array", "items": { "$ref": "#/components/schemas/ChatCompletionComplete" } }, "created": { "type": "integer", "format": "int64", "example": "1706270835", "minimum": 0 }, "id": { "type": "string" }, "model": { "type": "string", "example": "mistralai/Mistral-7B-Instruct-v0.2" }, "object": { "type": "string" }, "system_fingerprint": { "type": "string" }, "usage": { "$ref": "#/components/schemas/Usage" } } }, "ChatCompletionChoice": { "type": "object", "required": [ "index", "delta" ], "properties": { "delta": { "$ref": "#/components/schemas/ChatCompletionDelta" }, "finish_reason": { "type": "string", "nullable": true }, "index": { "type": "integer", "format": "int32", "minimum": 0 }, "logprobs": { "allOf": [ { "$ref": "#/components/schemas/ChatCompletionLogprobs" } ], "nullable": true } } }, "ChatCompletionChunk": { "type": "object", "required": [ "id", "object", "created", "model", "system_fingerprint", "choices" ], "properties": { "choices": { "type": "array", "items": { "$ref": "#/components/schemas/ChatCompletionChoice" } }, "created": { "type": "integer", "format": "int64", "example": "1706270978", "minimum": 0 }, "id": { "type": "string" }, "model": { "type": "string", "example": "mistralai/Mistral-7B-Instruct-v0.2" }, "object": { "type": "string" }, "system_fingerprint": { "type": "string" } } }, "ChatCompletionComplete": { "type": "object", "required": [ "index", "message", "finish_reason" ], "properties": { "finish_reason": { "type": "string" }, "index": { "type": "integer", "format": "int32", "minimum": 0 }, "logprobs": { "allOf": [ { "$ref": "#/components/schemas/ChatCompletionLogprobs" } ], "nullable": true }, "message": { "$ref": "#/components/schemas/Message" } } }, "ChatCompletionDelta": { "type": "object", "required": [ "role", "content" ], "properties": { "content": { "type": "string", "example": "What is Deep Learning?" }, "role": { "type": "string", "example": "user" } } }, "ChatCompletionLogprob": { "type": "object", "required": [ "token", "logprob", "top_logprobs" ], "properties": { "logprob": { "type": "number", "format": "float" }, "token": { "type": "string" }, "top_logprobs": { "type": "array", "items": { "$ref": "#/components/schemas/ChatCompletionTopLogprob" } } } }, "ChatCompletionLogprobs": { "type": "object", "required": [ "content" ], "properties": { "content": { "type": "array", "items": { "$ref": "#/components/schemas/ChatCompletionLogprob" } } } }, "ChatCompletionTopLogprob": { "type": "object", "required": [ "token", "logprob" ], "properties": { "logprob": { "type": "number", "format": "float" }, "token": { "type": "string" } } }, "ChatRequest": { "type": "object", "required": [ "model" ], "properties": { "frequency_penalty": { "type": "number", "format": "float", "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.", "example": "1.0", "nullable": true }, "logit_bias": { "type": "array", "items": { "type": "number", "format": "float" }, "description": "UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.", "nullable": true }, "logprobs": { "type": "boolean", "description": "Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each\noutput token returned in the content of message.", "example": "false", "nullable": true }, "max_tokens": { "type": "integer", "format": "int32", "description": "The maximum number of tokens that can be generated in the chat completion.", "example": "32", "nullable": true, "minimum": 0 }, "messages": { "type": "array", "items": { "$ref": "#/components/schemas/Message" }, "description": "A list of messages comprising the conversation so far." }, "model": { "type": "string", "description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.", "example": "mistralai/Mistral-7B-Instruct-v0.2" }, "n": { "type": "integer", "format": "int32", "description": "UNUSED\nHow many chat completion choices to generate for each input message. Note that you will be charged based on the\nnumber of generated tokens across all of the choices. Keep n as 1 to minimize costs.", "example": "2", "nullable": true, "minimum": 0 }, "presence_penalty": { "type": "number", "format": "float", "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics", "example": 0.1, "nullable": true }, "seed": { "type": "integer", "format": "int64", "example": 42, "nullable": true, "minimum": 0 }, "stream": { "type": "boolean" }, "temperature": { "type": "number", "format": "float", "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.", "example": 1.0, "nullable": true }, "top_logprobs": { "type": "integer", "format": "int32", "description": "An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.", "example": "5", "nullable": true, "minimum": 0 }, "top_p": { "type": "number", "format": "float", "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.", "example": 0.95, "nullable": true } } }, "CompatGenerateRequest": { "type": "object", "required": [ "inputs" ], "properties": { "inputs": { "type": "string", "example": "My name is Olivier and I" }, "parameters": { "$ref": "#/components/schemas/GenerateParameters" }, "stream": { "type": "boolean", "default": "false" } } }, "Details": { "type": "object", "required": [ "finish_reason", "generated_tokens", "prefill", "tokens" ], "properties": { "best_of_sequences": { "type": "array", "items": { "$ref": "#/components/schemas/BestOfSequence" }, "nullable": true }, "finish_reason": { "$ref": "#/components/schemas/FinishReason" }, "generated_tokens": { "type": "integer", "format": "int32", "example": 1, "minimum": 0 }, "prefill": { "type": "array", "items": { "$ref": "#/components/schemas/PrefillToken" } }, "seed": { "type": "integer", "format": "int64", "example": 42, "nullable": true, "minimum": 0 }, "tokens": { "type": "array", "items": { "$ref": "#/components/schemas/Token" } }, "top_tokens": { "type": "array", "items": { "type": "array", "items": { "$ref": "#/components/schemas/Token" } } } } }, "ErrorResponse": { "type": "object", "required": [ "error", "error_type" ], "properties": { "error": { "type": "string" }, "error_type": { "type": "string" } } }, "FinishReason": { "type": "string", "enum": [ "length", "eos_token", "stop_sequence" ], "example": "Length" }, "GenerateParameters": { "type": "object", "properties": { "best_of": { "type": "integer", "default": "null", "example": 1, "nullable": true, "minimum": 0, "exclusiveMinimum": 0 }, "decoder_input_details": { "type": "boolean", "default": "true" }, "details": { "type": "boolean", "default": "true" }, "do_sample": { "type": "boolean", "default": "false", "example": true }, "frequency_penalty": { "type": "number", "format": "float", "default": "null", "example": 0.1, "nullable": true, "exclusiveMinimum": -2 }, "grammar": { "allOf": [ { "$ref": "#/components/schemas/GrammarType" } ], "nullable": true }, "max_new_tokens": { "type": "integer", "format": "int32", "default": "100", "example": "20", "nullable": true, "minimum": 0 }, "repetition_penalty": { "type": "number", "format": "float", "default": "null", "example": 1.03, "nullable": true, "exclusiveMinimum": 0 }, "return_full_text": { "type": "boolean", "default": "null", "example": false, "nullable": true }, "seed": { "type": "integer", "format": "int64", "default": "null", "example": "null", "nullable": true, "minimum": 0, "exclusiveMinimum": 0 }, "stop": { "type": "array", "items": { "type": "string" }, "example": [ "photographer" ], "maxItems": 4 }, "temperature": { "type": "number", "format": "float", "default": "null", "example": 0.5, "nullable": true, "exclusiveMinimum": 0 }, "top_k": { "type": "integer", "format": "int32", "default": "null", "example": 10, "nullable": true, "exclusiveMinimum": 0 }, "top_n_tokens": { "type": "integer", "format": "int32", "default": "null", "example": 5, "nullable": true, "minimum": 0, "exclusiveMinimum": 0 }, "top_p": { "type": "number", "format": "float", "default": "null", "example": 0.95, "nullable": true, "maximum": 1, "exclusiveMinimum": 0 }, "truncate": { "type": "integer", "default": "null", "example": "null", "nullable": true, "minimum": 0 }, "typical_p": { "type": "number", "format": "float", "default": "null", "example": 0.95, "nullable": true, "maximum": 1, "exclusiveMinimum": 0 }, "watermark": { "type": "boolean", "default": "false", "example": true } } }, "GenerateRequest": { "type": "object", "required": [ "inputs" ], "properties": { "inputs": { "type": "string", "example": "My name is Olivier and I" }, "parameters": { "$ref": "#/components/schemas/GenerateParameters" } } }, "GenerateResponse": { "type": "object", "required": [ "generated_text" ], "properties": { "details": { "allOf": [ { "$ref": "#/components/schemas/Details" } ], "nullable": true }, "generated_text": { "type": "string", "example": "test" } } }, "GrammarType": { "oneOf": [ { "type": "object", "required": [ "type", "value" ], "properties": { "type": { "type": "string", "enum": [ "json" ] }, "value": { "description": "A string that represents a [JSON Schema](https://json-schema.org/).\n\nJSON Schema is a declarative language that allows to annotate JSON documents\nwith types and descriptions." } } }, { "type": "object", "required": [ "type", "value" ], "properties": { "type": { "type": "string", "enum": [ "regex" ] }, "value": { "type": "string" } } } ], "discriminator": { "propertyName": "type" } }, "Info": { "type": "object", "required": [ "model_id", "model_dtype", "model_device_type", "max_concurrent_requests", "max_best_of", "max_stop_sequences", "max_input_length", "max_total_tokens", "waiting_served_ratio", "max_batch_total_tokens", "max_waiting_tokens", "validation_workers", "version" ], "properties": { "docker_label": { "type": "string", "example": "null", "nullable": true }, "max_batch_size": { "type": "integer", "example": "null", "nullable": true, "minimum": 0 }, "max_batch_total_tokens": { "type": "integer", "format": "int32", "example": "32000", "minimum": 0 }, "max_best_of": { "type": "integer", "example": "2", "minimum": 0 }, "max_concurrent_requests": { "type": "integer", "description": "Router Parameters", "example": "128", "minimum": 0 }, "max_input_length": { "type": "integer", "example": "1024", "minimum": 0 }, "max_stop_sequences": { "type": "integer", "example": "4", "minimum": 0 }, "max_total_tokens": { "type": "integer", "example": "2048", "minimum": 0 }, "max_waiting_tokens": { "type": "integer", "example": "20", "minimum": 0 }, "model_device_type": { "type": "string", "example": "cuda" }, "model_dtype": { "type": "string", "example": "torch.float16" }, "model_id": { "type": "string", "description": "Model info", "example": "bigscience/blomm-560m" }, "model_pipeline_tag": { "type": "string", "example": "text-generation", "nullable": true }, "model_sha": { "type": "string", "example": "e985a63cdc139290c5f700ff1929f0b5942cced2", "nullable": true }, "sha": { "type": "string", "example": "null", "nullable": true }, "validation_workers": { "type": "integer", "example": "2", "minimum": 0 }, "version": { "type": "string", "description": "Router Info", "example": "0.5.0" }, "waiting_served_ratio": { "type": "number", "format": "float", "example": "1.2" } } }, "Message": { "type": "object", "required": [ "role", "content" ], "properties": { "content": { "type": "string", "example": "My name is David and I" }, "name": { "type": "string", "example": "\"David\"", "nullable": true }, "role": { "type": "string", "example": "user" } } }, "PrefillToken": { "type": "object", "required": [ "id", "text", "logprob" ], "properties": { "id": { "type": "integer", "format": "int32", "example": 0, "minimum": 0 }, "logprob": { "type": "number", "format": "float", "example": -0.34, "nullable": true }, "text": { "type": "string", "example": "test" } } }, "SimpleToken": { "type": "object", "required": [ "id", "text", "start", "stop" ], "properties": { "id": { "type": "integer", "format": "int32", "example": 0, "minimum": 0 }, "start": { "type": "integer", "example": 0, "minimum": 0 }, "stop": { "type": "integer", "example": 2, "minimum": 0 }, "text": { "type": "string", "example": "test" } } }, "StreamDetails": { "type": "object", "required": [ "finish_reason", "generated_tokens" ], "properties": { "finish_reason": { "$ref": "#/components/schemas/FinishReason" }, "generated_tokens": { "type": "integer", "format": "int32", "example": 1, "minimum": 0 }, "seed": { "type": "integer", "format": "int64", "example": 42, "nullable": true, "minimum": 0 } } }, "StreamResponse": { "type": "object", "required": [ "index", "token" ], "properties": { "details": { "allOf": [ { "$ref": "#/components/schemas/StreamDetails" } ], "default": "null", "nullable": true }, "generated_text": { "type": "string", "default": "null", "example": "test", "nullable": true }, "index": { "type": "integer", "format": "int32", "minimum": 0 }, "token": { "$ref": "#/components/schemas/Token" }, "top_tokens": { "type": "array", "items": { "$ref": "#/components/schemas/Token" } } } }, "Token": { "type": "object", "required": [ "id", "text", "logprob", "special" ], "properties": { "id": { "type": "integer", "format": "int32", "example": 0, "minimum": 0 }, "logprob": { "type": "number", "format": "float", "example": -0.34, "nullable": true }, "special": { "type": "boolean", "example": "false" }, "text": { "type": "string", "example": "test" } } }, "TokenizeResponse": { "type": "array", "items": { "$ref": "#/components/schemas/SimpleToken" } }, "Usage": { "type": "object", "required": [ "prompt_tokens", "completion_tokens", "total_tokens" ], "properties": { "completion_tokens": { "type": "integer", "format": "int32", "minimum": 0 }, "prompt_tokens": { "type": "integer", "format": "int32", "minimum": 0 }, "total_tokens": { "type": "integer", "format": "int32", "minimum": 0 } } } } }, "tags": [ { "name": "Text Generation Inference", "description": "Hugging Face Text Generation Inference API" } ] }