884 lines
22 KiB
JSON
884 lines
22 KiB
JSON
{
|
|
"openapi": "3.0.3",
|
|
"info": {
|
|
"title": "Text Generation Inference",
|
|
"description": "Text Generation Webserver",
|
|
"contact": {
|
|
"name": "Olivier Dehaene"
|
|
},
|
|
"license": {
|
|
"name": "Apache 2.0",
|
|
"url": "https://www.apache.org/licenses/LICENSE-2.0"
|
|
},
|
|
"version": "1.2.0"
|
|
},
|
|
"paths": {
|
|
"/": {
|
|
"post": {
|
|
"tags": [
|
|
"Text Generation Inference"
|
|
],
|
|
"summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
|
|
"description": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
|
|
"operationId": "compat_generate",
|
|
"requestBody": {
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/CompatGenerateRequest"
|
|
}
|
|
}
|
|
},
|
|
"required": true
|
|
},
|
|
"responses": {
|
|
"200": {
|
|
"description": "Generated Text",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/GenerateResponse"
|
|
}
|
|
},
|
|
"text/event-stream": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/StreamResponse"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"422": {
|
|
"description": "Input validation error",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Input validation error"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"424": {
|
|
"description": "Generation Error",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Request failed during generation"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"429": {
|
|
"description": "Model is overloaded",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Model is overloaded"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"500": {
|
|
"description": "Incomplete generation",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Incomplete generation"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/generate": {
|
|
"post": {
|
|
"tags": [
|
|
"Text Generation Inference"
|
|
],
|
|
"summary": "Generate tokens",
|
|
"description": "Generate tokens",
|
|
"operationId": "generate",
|
|
"requestBody": {
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/GenerateRequest"
|
|
}
|
|
}
|
|
},
|
|
"required": true
|
|
},
|
|
"responses": {
|
|
"200": {
|
|
"description": "Generated Text",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/GenerateResponse"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"422": {
|
|
"description": "Input validation error",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Input validation error"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"424": {
|
|
"description": "Generation Error",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Request failed during generation"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"429": {
|
|
"description": "Model is overloaded",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Model is overloaded"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"500": {
|
|
"description": "Incomplete generation",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Incomplete generation"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/generate_stream": {
|
|
"post": {
|
|
"tags": [
|
|
"Text Generation Inference"
|
|
],
|
|
"summary": "Generate a stream of token using Server-Sent Events",
|
|
"description": "Generate a stream of token using Server-Sent Events",
|
|
"operationId": "generate_stream",
|
|
"requestBody": {
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/GenerateRequest"
|
|
}
|
|
}
|
|
},
|
|
"required": true
|
|
},
|
|
"responses": {
|
|
"200": {
|
|
"description": "Generated Text",
|
|
"content": {
|
|
"text/event-stream": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/StreamResponse"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"422": {
|
|
"description": "Input validation error",
|
|
"content": {
|
|
"text/event-stream": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Input validation error"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"424": {
|
|
"description": "Generation Error",
|
|
"content": {
|
|
"text/event-stream": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Request failed during generation"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"429": {
|
|
"description": "Model is overloaded",
|
|
"content": {
|
|
"text/event-stream": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Model is overloaded"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"500": {
|
|
"description": "Incomplete generation",
|
|
"content": {
|
|
"text/event-stream": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "Incomplete generation"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/health": {
|
|
"get": {
|
|
"tags": [
|
|
"Text Generation Inference"
|
|
],
|
|
"summary": "Health check method",
|
|
"description": "Health check method",
|
|
"operationId": "health",
|
|
"responses": {
|
|
"200": {
|
|
"description": "Everything is working fine"
|
|
},
|
|
"503": {
|
|
"description": "Text generation inference is down",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ErrorResponse"
|
|
},
|
|
"example": {
|
|
"error": "unhealthy",
|
|
"error_type": "healthcheck"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/info": {
|
|
"get": {
|
|
"tags": [
|
|
"Text Generation Inference"
|
|
],
|
|
"summary": "Text Generation Inference endpoint info",
|
|
"description": "Text Generation Inference endpoint info",
|
|
"operationId": "get_model_info",
|
|
"responses": {
|
|
"200": {
|
|
"description": "Served model info",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/Info"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/metrics": {
|
|
"get": {
|
|
"tags": [
|
|
"Text Generation Inference"
|
|
],
|
|
"summary": "Prometheus metrics scrape endpoint",
|
|
"description": "Prometheus metrics scrape endpoint",
|
|
"operationId": "metrics",
|
|
"responses": {
|
|
"200": {
|
|
"description": "Prometheus Metrics",
|
|
"content": {
|
|
"text/plain": {
|
|
"schema": {
|
|
"type": "string"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"components": {
|
|
"schemas": {
|
|
"BestOfSequence": {
|
|
"type": "object",
|
|
"required": [
|
|
"generated_text",
|
|
"finish_reason",
|
|
"generated_tokens",
|
|
"prefill",
|
|
"tokens"
|
|
],
|
|
"properties": {
|
|
"finish_reason": {
|
|
"$ref": "#/components/schemas/FinishReason"
|
|
},
|
|
"generated_text": {
|
|
"type": "string",
|
|
"example": "test"
|
|
},
|
|
"generated_tokens": {
|
|
"type": "integer",
|
|
"format": "int32",
|
|
"example": 1,
|
|
"minimum": 0
|
|
},
|
|
"prefill": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/PrefillToken"
|
|
}
|
|
},
|
|
"seed": {
|
|
"type": "integer",
|
|
"format": "int64",
|
|
"example": 42,
|
|
"nullable": true,
|
|
"minimum": 0
|
|
},
|
|
"tokens": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/Token"
|
|
}
|
|
},
|
|
"top_tokens": {
|
|
"type": "array",
|
|
"items": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/Token"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"CompatGenerateRequest": {
|
|
"type": "object",
|
|
"required": [
|
|
"inputs"
|
|
],
|
|
"properties": {
|
|
"inputs": {
|
|
"type": "string",
|
|
"example": "My name is Olivier and I"
|
|
},
|
|
"parameters": {
|
|
"$ref": "#/components/schemas/GenerateParameters"
|
|
},
|
|
"stream": {
|
|
"type": "boolean",
|
|
"default": "false"
|
|
}
|
|
}
|
|
},
|
|
"Details": {
|
|
"type": "object",
|
|
"required": [
|
|
"finish_reason",
|
|
"generated_tokens",
|
|
"prefill",
|
|
"tokens"
|
|
],
|
|
"properties": {
|
|
"best_of_sequences": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/BestOfSequence"
|
|
},
|
|
"nullable": true
|
|
},
|
|
"finish_reason": {
|
|
"$ref": "#/components/schemas/FinishReason"
|
|
},
|
|
"generated_tokens": {
|
|
"type": "integer",
|
|
"format": "int32",
|
|
"example": 1,
|
|
"minimum": 0
|
|
},
|
|
"prefill": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/PrefillToken"
|
|
}
|
|
},
|
|
"seed": {
|
|
"type": "integer",
|
|
"format": "int64",
|
|
"example": 42,
|
|
"nullable": true,
|
|
"minimum": 0
|
|
},
|
|
"tokens": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/Token"
|
|
}
|
|
},
|
|
"top_tokens": {
|
|
"type": "array",
|
|
"items": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/Token"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"ErrorResponse": {
|
|
"type": "object",
|
|
"required": [
|
|
"error",
|
|
"error_type"
|
|
],
|
|
"properties": {
|
|
"error": {
|
|
"type": "string"
|
|
},
|
|
"error_type": {
|
|
"type": "string"
|
|
}
|
|
}
|
|
},
|
|
"FinishReason": {
|
|
"type": "string",
|
|
"enum": [
|
|
"length",
|
|
"eos_token",
|
|
"stop_sequence"
|
|
]
|
|
},
|
|
"GenerateParameters": {
|
|
"type": "object",
|
|
"properties": {
|
|
"best_of": {
|
|
"type": "integer",
|
|
"default": "null",
|
|
"example": 1,
|
|
"nullable": true,
|
|
"minimum": 0,
|
|
"exclusiveMinimum": 0
|
|
},
|
|
"decoder_input_details": {
|
|
"type": "boolean",
|
|
"default": "true"
|
|
},
|
|
"details": {
|
|
"type": "boolean",
|
|
"default": "true"
|
|
},
|
|
"do_sample": {
|
|
"type": "boolean",
|
|
"default": "false",
|
|
"example": true
|
|
},
|
|
"max_new_tokens": {
|
|
"type": "integer",
|
|
"format": "int32",
|
|
"default": "null",
|
|
"example": "20",
|
|
"nullable": true,
|
|
"minimum": 0
|
|
},
|
|
"repetition_penalty": {
|
|
"type": "number",
|
|
"format": "float",
|
|
"default": "null",
|
|
"example": 1.03,
|
|
"nullable": true,
|
|
"exclusiveMinimum": 0
|
|
},
|
|
"return_full_text": {
|
|
"type": "boolean",
|
|
"default": "null",
|
|
"example": false,
|
|
"nullable": true
|
|
},
|
|
"seed": {
|
|
"type": "integer",
|
|
"format": "int64",
|
|
"default": "null",
|
|
"example": "null",
|
|
"nullable": true,
|
|
"minimum": 0,
|
|
"exclusiveMinimum": 0
|
|
},
|
|
"stop": {
|
|
"type": "array",
|
|
"items": {
|
|
"type": "string"
|
|
},
|
|
"example": [
|
|
"photographer"
|
|
],
|
|
"maxItems": 4
|
|
},
|
|
"temperature": {
|
|
"type": "number",
|
|
"format": "float",
|
|
"default": "null",
|
|
"example": 0.5,
|
|
"nullable": true,
|
|
"exclusiveMinimum": 0
|
|
},
|
|
"top_k": {
|
|
"type": "integer",
|
|
"format": "int32",
|
|
"default": "null",
|
|
"example": 10,
|
|
"nullable": true,
|
|
"exclusiveMinimum": 0
|
|
},
|
|
"top_n_tokens": {
|
|
"type": "integer",
|
|
"format": "int32",
|
|
"default": "null",
|
|
"example": 5,
|
|
"nullable": true,
|
|
"minimum": 0,
|
|
"exclusiveMinimum": 0
|
|
},
|
|
"top_p": {
|
|
"type": "number",
|
|
"format": "float",
|
|
"default": "null",
|
|
"example": 0.95,
|
|
"nullable": true,
|
|
"maximum": 1,
|
|
"exclusiveMinimum": 0
|
|
},
|
|
"truncate": {
|
|
"type": "integer",
|
|
"default": "null",
|
|
"example": "null",
|
|
"nullable": true,
|
|
"minimum": 0
|
|
},
|
|
"typical_p": {
|
|
"type": "number",
|
|
"format": "float",
|
|
"default": "null",
|
|
"example": 0.95,
|
|
"nullable": true,
|
|
"maximum": 1,
|
|
"exclusiveMinimum": 0
|
|
},
|
|
"watermark": {
|
|
"type": "boolean",
|
|
"default": "false",
|
|
"example": true
|
|
}
|
|
}
|
|
},
|
|
"GenerateRequest": {
|
|
"type": "object",
|
|
"required": [
|
|
"inputs"
|
|
],
|
|
"properties": {
|
|
"inputs": {
|
|
"type": "string",
|
|
"example": "My name is Olivier and I"
|
|
},
|
|
"parameters": {
|
|
"$ref": "#/components/schemas/GenerateParameters"
|
|
}
|
|
}
|
|
},
|
|
"GenerateResponse": {
|
|
"type": "object",
|
|
"required": [
|
|
"generated_text"
|
|
],
|
|
"properties": {
|
|
"details": {
|
|
"allOf": [
|
|
{
|
|
"$ref": "#/components/schemas/Details"
|
|
}
|
|
],
|
|
"nullable": true
|
|
},
|
|
"generated_text": {
|
|
"type": "string",
|
|
"example": "test"
|
|
}
|
|
}
|
|
},
|
|
"Info": {
|
|
"type": "object",
|
|
"required": [
|
|
"model_id",
|
|
"model_dtype",
|
|
"model_device_type",
|
|
"max_concurrent_requests",
|
|
"max_best_of",
|
|
"max_stop_sequences",
|
|
"max_input_length",
|
|
"max_total_tokens",
|
|
"waiting_served_ratio",
|
|
"max_batch_total_tokens",
|
|
"max_waiting_tokens",
|
|
"validation_workers",
|
|
"version"
|
|
],
|
|
"properties": {
|
|
"docker_label": {
|
|
"type": "string",
|
|
"example": "null",
|
|
"nullable": true
|
|
},
|
|
"max_batch_total_tokens": {
|
|
"type": "integer",
|
|
"format": "int32",
|
|
"example": "32000",
|
|
"minimum": 0
|
|
},
|
|
"max_best_of": {
|
|
"type": "integer",
|
|
"example": "2",
|
|
"minimum": 0
|
|
},
|
|
"max_concurrent_requests": {
|
|
"type": "integer",
|
|
"description": "Router Parameters",
|
|
"example": "128",
|
|
"minimum": 0
|
|
},
|
|
"max_input_length": {
|
|
"type": "integer",
|
|
"example": "1024",
|
|
"minimum": 0
|
|
},
|
|
"max_stop_sequences": {
|
|
"type": "integer",
|
|
"example": "4",
|
|
"minimum": 0
|
|
},
|
|
"max_total_tokens": {
|
|
"type": "integer",
|
|
"example": "2048",
|
|
"minimum": 0
|
|
},
|
|
"max_waiting_tokens": {
|
|
"type": "integer",
|
|
"example": "20",
|
|
"minimum": 0
|
|
},
|
|
"model_device_type": {
|
|
"type": "string",
|
|
"example": "cuda"
|
|
},
|
|
"model_dtype": {
|
|
"type": "string",
|
|
"example": "torch.float16"
|
|
},
|
|
"model_id": {
|
|
"type": "string",
|
|
"description": "Model info",
|
|
"example": "bigscience/blomm-560m"
|
|
},
|
|
"model_pipeline_tag": {
|
|
"type": "string",
|
|
"example": "text-generation",
|
|
"nullable": true
|
|
},
|
|
"model_sha": {
|
|
"type": "string",
|
|
"example": "e985a63cdc139290c5f700ff1929f0b5942cced2",
|
|
"nullable": true
|
|
},
|
|
"sha": {
|
|
"type": "string",
|
|
"example": "null",
|
|
"nullable": true
|
|
},
|
|
"validation_workers": {
|
|
"type": "integer",
|
|
"example": "2",
|
|
"minimum": 0
|
|
},
|
|
"version": {
|
|
"type": "string",
|
|
"description": "Router Info",
|
|
"example": "0.5.0"
|
|
},
|
|
"waiting_served_ratio": {
|
|
"type": "number",
|
|
"format": "float",
|
|
"example": "1.2"
|
|
}
|
|
}
|
|
},
|
|
"PrefillToken": {
|
|
"type": "object",
|
|
"required": [
|
|
"id",
|
|
"text",
|
|
"logprob"
|
|
],
|
|
"properties": {
|
|
"id": {
|
|
"type": "integer",
|
|
"format": "int32",
|
|
"example": 0,
|
|
"minimum": 0
|
|
},
|
|
"logprob": {
|
|
"type": "number",
|
|
"format": "float",
|
|
"example": -0.34,
|
|
"nullable": true
|
|
},
|
|
"text": {
|
|
"type": "string",
|
|
"example": "test"
|
|
}
|
|
}
|
|
},
|
|
"StreamDetails": {
|
|
"type": "object",
|
|
"required": [
|
|
"finish_reason",
|
|
"generated_tokens"
|
|
],
|
|
"properties": {
|
|
"finish_reason": {
|
|
"$ref": "#/components/schemas/FinishReason"
|
|
},
|
|
"generated_tokens": {
|
|
"type": "integer",
|
|
"format": "int32",
|
|
"example": 1,
|
|
"minimum": 0
|
|
},
|
|
"seed": {
|
|
"type": "integer",
|
|
"format": "int64",
|
|
"example": 42,
|
|
"nullable": true,
|
|
"minimum": 0
|
|
}
|
|
}
|
|
},
|
|
"StreamResponse": {
|
|
"type": "object",
|
|
"required": [
|
|
"token"
|
|
],
|
|
"properties": {
|
|
"details": {
|
|
"allOf": [
|
|
{
|
|
"$ref": "#/components/schemas/StreamDetails"
|
|
}
|
|
],
|
|
"default": "null",
|
|
"nullable": true
|
|
},
|
|
"generated_text": {
|
|
"type": "string",
|
|
"default": "null",
|
|
"example": "test",
|
|
"nullable": true
|
|
},
|
|
"token": {
|
|
"$ref": "#/components/schemas/Token"
|
|
},
|
|
"top_tokens": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/Token"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"Token": {
|
|
"type": "object",
|
|
"required": [
|
|
"id",
|
|
"text",
|
|
"logprob",
|
|
"special"
|
|
],
|
|
"properties": {
|
|
"id": {
|
|
"type": "integer",
|
|
"format": "int32",
|
|
"example": 0,
|
|
"minimum": 0
|
|
},
|
|
"logprob": {
|
|
"type": "number",
|
|
"format": "float",
|
|
"example": -0.34,
|
|
"nullable": true
|
|
},
|
|
"special": {
|
|
"type": "boolean",
|
|
"example": "false"
|
|
},
|
|
"text": {
|
|
"type": "string",
|
|
"example": "test"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"tags": [
|
|
{
|
|
"name": "Text Generation Inference",
|
|
"description": "Hugging Face Text Generation Inference API"
|
|
}
|
|
]
|
|
}
|