{ "openapi": "3.0.3", "info": { "title": "Text Generation Inference", "description": "Text Generation Webserver", "contact": { "name": "Olivier Dehaene" }, "license": { "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" }, "version": "0.9.4" }, "paths": { "/": { "post": { "tags": [ "Text Generation Inference" ], "summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`", "description": "Generate tokens if `stream == false` or a stream of token if `stream == true`", "operationId": "compat_generate", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CompatGenerateRequest" } } }, "required": true }, "responses": { "200": { "description": "Generated Text", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GenerateResponse" } }, "text/event-stream": { "schema": { "$ref": "#/components/schemas/StreamResponse" } } } }, "422": { "description": "Input validation error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Input validation error" } } } }, "424": { "description": "Generation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Request failed during generation" } } } }, "429": { "description": "Model is overloaded", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Model is overloaded" } } } }, "500": { "description": "Incomplete generation", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Incomplete generation" } } } } } } }, "/generate": { "post": { "tags": [ "Text Generation Inference" ], "summary": "Generate tokens", "description": "Generate tokens", "operationId": "generate", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GenerateRequest" } } }, "required": true }, "responses": { "200": { "description": "Generated Text", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GenerateResponse" } } } }, "422": { "description": "Input validation error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Input validation error" } } } }, "424": { "description": "Generation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Request failed during generation" } } } }, "429": { "description": "Model is overloaded", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Model is overloaded" } } } }, "500": { "description": "Incomplete generation", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Incomplete generation" } } } } } } }, "/generate_stream": { "post": { "tags": [ "Text Generation Inference" ], "summary": "Generate a stream of token using Server-Sent Events", "description": "Generate a stream of token using Server-Sent Events", "operationId": "generate_stream", "requestBody": { "content": { "application/json": { "schema": { "$ref": "#/components/schemas/GenerateRequest" } } }, "required": true }, "responses": { "200": { "description": "Generated Text", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/StreamResponse" } } } }, "422": { "description": "Input validation error", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Input validation error" } } } }, "424": { "description": "Generation Error", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Request failed during generation" } } } }, "429": { "description": "Model is overloaded", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Model is overloaded" } } } }, "500": { "description": "Incomplete generation", "content": { "text/event-stream": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "Incomplete generation" } } } } } } }, "/health": { "get": { "tags": [ "Text Generation Inference" ], "summary": "Health check method", "description": "Health check method", "operationId": "health", "responses": { "200": { "description": "Everything is working fine" }, "503": { "description": "Text generation inference is down", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ErrorResponse" }, "example": { "error": "unhealthy", "error_type": "healthcheck" } } } } } } }, "/info": { "get": { "tags": [ "Text Generation Inference" ], "summary": "Text Generation Inference endpoint info", "description": "Text Generation Inference endpoint info", "operationId": "get_model_info", "responses": { "200": { "description": "Served model info", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Info" } } } } } } }, "/metrics": { "get": { "tags": [ "Text Generation Inference" ], "summary": "Prometheus metrics scrape endpoint", "description": "Prometheus metrics scrape endpoint", "operationId": "metrics", "responses": { "200": { "description": "Prometheus Metrics", "content": { "text/plain": { "schema": { "type": "string" } } } } } } } }, "components": { "schemas": { "BestOfSequence": { "type": "object", "required": [ "generated_text", "finish_reason", "generated_tokens", "prefill", "tokens" ], "properties": { "finish_reason": { "$ref": "#/components/schemas/FinishReason" }, "generated_text": { "type": "string", "example": "test" }, "generated_tokens": { "type": "integer", "format": "int32", "example": 1, "minimum": 0.0 }, "prefill": { "type": "array", "items": { "$ref": "#/components/schemas/PrefillToken" } }, "seed": { "type": "integer", "format": "int64", "example": 42, "nullable": true, "minimum": 0.0 }, "tokens": { "type": "array", "items": { "$ref": "#/components/schemas/Token" } } } }, "CompatGenerateRequest": { "type": "object", "required": [ "inputs" ], "properties": { "inputs": { "type": "string", "example": "My name is Olivier and I" }, "parameters": { "$ref": "#/components/schemas/GenerateParameters" }, "stream": { "type": "boolean", "default": "false" } } }, "Details": { "type": "object", "required": [ "finish_reason", "generated_tokens", "prefill", "tokens" ], "properties": { "best_of_sequences": { "type": "array", "items": { "$ref": "#/components/schemas/BestOfSequence" }, "nullable": true }, "finish_reason": { "$ref": "#/components/schemas/FinishReason" }, "generated_tokens": { "type": "integer", "format": "int32", "example": 1, "minimum": 0.0 }, "prefill": { "type": "array", "items": { "$ref": "#/components/schemas/PrefillToken" } }, "seed": { "type": "integer", "format": "int64", "example": 42, "nullable": true, "minimum": 0.0 }, "tokens": { "type": "array", "items": { "$ref": "#/components/schemas/Token" } } } }, "ErrorResponse": { "type": "object", "required": [ "error", "error_type" ], "properties": { "error": { "type": "string" }, "error_type": { "type": "string" } } }, "FinishReason": { "type": "string", "enum": [ "length", "eos_token", "stop_sequence" ] }, "GenerateParameters": { "type": "object", "properties": { "best_of": { "type": "integer", "default": "null", "example": 1, "nullable": true, "minimum": 0.0, "exclusiveMinimum": 0.0 }, "decoder_input_details": { "type": "boolean", "default": "true" }, "details": { "type": "boolean", "default": "true" }, "do_sample": { "type": "boolean", "default": "false", "example": true }, "max_new_tokens": { "type": "integer", "format": "int32", "default": "20", "minimum": 0.0, "exclusiveMaximum": 512.0, "exclusiveMinimum": 0.0 }, "repetition_penalty": { "type": "number", "format": "float", "default": "null", "example": 1.03, "nullable": true, "exclusiveMinimum": 0.0 }, "return_full_text": { "type": "boolean", "default": "null", "example": false, "nullable": true }, "seed": { "type": "integer", "format": "int64", "default": "null", "example": "null", "nullable": true, "minimum": 0.0, "exclusiveMinimum": 0.0 }, "stop": { "type": "array", "items": { "type": "string" }, "example": [ "photographer" ], "maxItems": 4 }, "temperature": { "type": "number", "format": "float", "default": "null", "example": 0.5, "nullable": true, "exclusiveMinimum": 0.0 }, "top_k": { "type": "integer", "format": "int32", "default": "null", "example": 10, "nullable": true, "exclusiveMinimum": 0.0 }, "top_p": { "type": "number", "format": "float", "default": "null", "example": 0.95, "nullable": true, "maximum": 1.0, "exclusiveMinimum": 0.0 }, "truncate": { "type": "integer", "default": "null", "example": "null", "nullable": true, "minimum": 0.0 }, "typical_p": { "type": "number", "format": "float", "default": "null", "example": 0.95, "nullable": true, "maximum": 1.0, "exclusiveMinimum": 0.0 }, "watermark": { "type": "boolean", "default": "false", "example": true } } }, "GenerateRequest": { "type": "object", "required": [ "inputs" ], "properties": { "inputs": { "type": "string", "example": "My name is Olivier and I" }, "parameters": { "$ref": "#/components/schemas/GenerateParameters" } } }, "GenerateResponse": { "type": "object", "required": [ "generated_text" ], "properties": { "details": { "allOf": [ { "$ref": "#/components/schemas/Details" } ], "nullable": true }, "generated_text": { "type": "string", "example": "test" } } }, "Info": { "type": "object", "required": [ "model_id", "model_dtype", "model_device_type", "max_concurrent_requests", "max_best_of", "max_stop_sequences", "max_input_length", "max_total_tokens", "waiting_served_ratio", "max_batch_total_tokens", "max_waiting_tokens", "validation_workers", "version" ], "properties": { "docker_label": { "type": "string", "example": "null", "nullable": true }, "max_batch_total_tokens": { "type": "integer", "format": "int32", "example": "32000", "minimum": 0.0 }, "max_best_of": { "type": "integer", "example": "2", "minimum": 0.0 }, "max_concurrent_requests": { "type": "integer", "description": "Router Parameters", "example": "128", "minimum": 0.0 }, "max_input_length": { "type": "integer", "example": "1024", "minimum": 0.0 }, "max_stop_sequences": { "type": "integer", "example": "4", "minimum": 0.0 }, "max_total_tokens": { "type": "integer", "example": "2048", "minimum": 0.0 }, "max_waiting_tokens": { "type": "integer", "example": "20", "minimum": 0.0 }, "model_device_type": { "type": "string", "example": "cuda" }, "model_dtype": { "type": "string", "example": "torch.float16" }, "model_id": { "type": "string", "description": "Model info", "example": "bigscience/blomm-560m" }, "model_pipeline_tag": { "type": "string", "example": "text-generation", "nullable": true }, "model_sha": { "type": "string", "example": "e985a63cdc139290c5f700ff1929f0b5942cced2", "nullable": true }, "sha": { "type": "string", "example": "null", "nullable": true }, "validation_workers": { "type": "integer", "example": "2", "minimum": 0.0 }, "version": { "type": "string", "description": "Router Info", "example": "0.5.0" }, "waiting_served_ratio": { "type": "number", "format": "float", "example": "1.2" } } }, "PrefillToken": { "type": "object", "required": [ "id", "text", "logprob" ], "properties": { "id": { "type": "integer", "format": "int32", "example": 0, "minimum": 0.0 }, "logprob": { "type": "number", "format": "float", "example": -0.34, "nullable": true }, "text": { "type": "string", "example": "test" } } }, "StreamDetails": { "type": "object", "required": [ "finish_reason", "generated_tokens" ], "properties": { "finish_reason": { "$ref": "#/components/schemas/FinishReason" }, "generated_tokens": { "type": "integer", "format": "int32", "example": 1, "minimum": 0.0 }, "seed": { "type": "integer", "format": "int64", "example": 42, "nullable": true, "minimum": 0.0 } } }, "StreamResponse": { "type": "object", "required": [ "token" ], "properties": { "details": { "allOf": [ { "$ref": "#/components/schemas/StreamDetails" } ], "nullable": true }, "generated_text": { "type": "string", "default": "null", "example": "test", "nullable": true }, "token": { "$ref": "#/components/schemas/Token" } } }, "Token": { "type": "object", "required": [ "id", "text", "logprob", "special" ], "properties": { "id": { "type": "integer", "format": "int32", "example": 0, "minimum": 0.0 }, "logprob": { "type": "number", "format": "float", "example": -0.34, "nullable": true }, "special": { "type": "boolean", "example": "false" }, "text": { "type": "string", "example": "test" } } } } }, "tags": [ { "name": "Text Generation Inference", "description": "Hugging Face Text Generation Inference API" } ] }