diff --git a/docs/openapi.json b/docs/openapi.json index df2d427f..4454259b 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -1,883 +1 @@ -{ - "openapi": "3.0.3", - "info": { - "title": "Text Generation Inference", - "description": "Text Generation Webserver", - "contact": { - "name": "Olivier Dehaene" - }, - "license": { - "name": "Apache 2.0", - "url": "https://www.apache.org/licenses/LICENSE-2.0" - }, - "version": "1.3.4" - }, - "paths": { - "/": { - "post": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`", - "description": "Generate tokens if `stream == false` or a stream of token if `stream == true`", - "operationId": "compat_generate", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/CompatGenerateRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Generated Text", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GenerateResponse" - } - }, - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/StreamResponse" - } - } - } - }, - "422": { - "description": "Input validation error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Input validation error" - } - } - } - }, - "424": { - "description": "Generation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Request failed during generation" - } - } - } - }, - "429": { - "description": "Model is overloaded", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Model is overloaded" - } - } - } - }, - "500": { - "description": "Incomplete generation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Incomplete generation" - } - } - } - } - } - } - }, - "/generate": { - "post": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Generate tokens", - "description": "Generate tokens", - "operationId": "generate", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GenerateRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Generated Text", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GenerateResponse" - } - } - } - }, - "422": { - "description": "Input validation error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Input validation error" - } - } - } - }, - "424": { - "description": "Generation Error", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Request failed during generation" - } - } - } - }, - "429": { - "description": "Model is overloaded", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Model is overloaded" - } - } - } - }, - "500": { - "description": "Incomplete generation", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Incomplete generation" - } - } - } - } - } - } - }, - 
"/generate_stream": { - "post": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Generate a stream of token using Server-Sent Events", - "description": "Generate a stream of token using Server-Sent Events", - "operationId": "generate_stream", - "requestBody": { - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/GenerateRequest" - } - } - }, - "required": true - }, - "responses": { - "200": { - "description": "Generated Text", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/StreamResponse" - } - } - } - }, - "422": { - "description": "Input validation error", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Input validation error" - } - } - } - }, - "424": { - "description": "Generation Error", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Request failed during generation" - } - } - } - }, - "429": { - "description": "Model is overloaded", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Model is overloaded" - } - } - } - }, - "500": { - "description": "Incomplete generation", - "content": { - "text/event-stream": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "Incomplete generation" - } - } - } - } - } - } - }, - "/health": { - "get": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Health check method", - "description": "Health check method", - "operationId": "health", - "responses": { - "200": { - "description": "Everything is working fine" - }, - "503": { - "description": "Text generation inference is down", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/ErrorResponse" - }, - "example": { - "error": "unhealthy", - "error_type": "healthcheck" - } - } - } - } - } - } - }, - "/info": { - "get": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Text Generation Inference endpoint info", - "description": "Text Generation Inference endpoint info", - "operationId": "get_model_info", - "responses": { - "200": { - "description": "Served model info", - "content": { - "application/json": { - "schema": { - "$ref": "#/components/schemas/Info" - } - } - } - } - } - } - }, - "/metrics": { - "get": { - "tags": [ - "Text Generation Inference" - ], - "summary": "Prometheus metrics scrape endpoint", - "description": "Prometheus metrics scrape endpoint", - "operationId": "metrics", - "responses": { - "200": { - "description": "Prometheus Metrics", - "content": { - "text/plain": { - "schema": { - "type": "string" - } - } - } - } - } - } - } - }, - "components": { - "schemas": { - "BestOfSequence": { - "type": "object", - "required": [ - "generated_text", - "finish_reason", - "generated_tokens", - "prefill", - "tokens" - ], - "properties": { - "finish_reason": { - "$ref": "#/components/schemas/FinishReason" - }, - "generated_text": { - "type": "string", - "example": "test" - }, - "generated_tokens": { - "type": "integer", - "format": "int32", - "example": 1, - "minimum": 0 - }, - "prefill": { - "type": "array", - "items": { - "$ref": "#/components/schemas/PrefillToken" - } - }, - "seed": { - "type": "integer", - "format": "int64", - "example": 42, - "nullable": true, - "minimum": 0 - }, - "tokens": { - "type": "array", - "items": { - "$ref": 
"#/components/schemas/Token" - } - }, - "top_tokens": { - "type": "array", - "items": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Token" - } - } - } - } - }, - "CompatGenerateRequest": { - "type": "object", - "required": [ - "inputs" - ], - "properties": { - "inputs": { - "type": "string", - "example": "My name is Olivier and I" - }, - "parameters": { - "$ref": "#/components/schemas/GenerateParameters" - }, - "stream": { - "type": "boolean", - "default": "false" - } - } - }, - "Details": { - "type": "object", - "required": [ - "finish_reason", - "generated_tokens", - "prefill", - "tokens" - ], - "properties": { - "best_of_sequences": { - "type": "array", - "items": { - "$ref": "#/components/schemas/BestOfSequence" - }, - "nullable": true - }, - "finish_reason": { - "$ref": "#/components/schemas/FinishReason" - }, - "generated_tokens": { - "type": "integer", - "format": "int32", - "example": 1, - "minimum": 0 - }, - "prefill": { - "type": "array", - "items": { - "$ref": "#/components/schemas/PrefillToken" - } - }, - "seed": { - "type": "integer", - "format": "int64", - "example": 42, - "nullable": true, - "minimum": 0 - }, - "tokens": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Token" - } - }, - "top_tokens": { - "type": "array", - "items": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Token" - } - } - } - } - }, - "ErrorResponse": { - "type": "object", - "required": [ - "error", - "error_type" - ], - "properties": { - "error": { - "type": "string" - }, - "error_type": { - "type": "string" - } - } - }, - "FinishReason": { - "type": "string", - "enum": [ - "length", - "eos_token", - "stop_sequence" - ] - }, - "GenerateParameters": { - "type": "object", - "properties": { - "best_of": { - "type": "integer", - "default": "null", - "example": 1, - "nullable": true, - "minimum": 0, - "exclusiveMinimum": 0 - }, - "decoder_input_details": { - "type": "boolean", - "default": "true" - }, - "details": { - "type": "boolean", - "default": "true" - }, - "do_sample": { - "type": "boolean", - "default": "false", - "example": true - }, - "max_new_tokens": { - "type": "integer", - "format": "int32", - "default": "20", - "example": "20", - "nullable": true, - "minimum": 0 - }, - "repetition_penalty": { - "type": "number", - "format": "float", - "default": "null", - "example": 1.03, - "nullable": true, - "exclusiveMinimum": 0 - }, - "return_full_text": { - "type": "boolean", - "default": "null", - "example": false, - "nullable": true - }, - "seed": { - "type": "integer", - "format": "int64", - "default": "null", - "example": "null", - "nullable": true, - "minimum": 0, - "exclusiveMinimum": 0 - }, - "stop": { - "type": "array", - "items": { - "type": "string" - }, - "example": [ - "photographer" - ], - "maxItems": 4 - }, - "temperature": { - "type": "number", - "format": "float", - "default": "null", - "example": 0.5, - "nullable": true, - "exclusiveMinimum": 0 - }, - "top_k": { - "type": "integer", - "format": "int32", - "default": "null", - "example": 10, - "nullable": true, - "exclusiveMinimum": 0 - }, - "top_n_tokens": { - "type": "integer", - "format": "int32", - "default": "null", - "example": 5, - "nullable": true, - "minimum": 0, - "exclusiveMinimum": 0 - }, - "top_p": { - "type": "number", - "format": "float", - "default": "null", - "example": 0.95, - "nullable": true, - "maximum": 1, - "exclusiveMinimum": 0 - }, - "truncate": { - "type": "integer", - "default": "null", - "example": "null", - "nullable": true, - "minimum": 0 - }, - 
"typical_p": { - "type": "number", - "format": "float", - "default": "null", - "example": 0.95, - "nullable": true, - "maximum": 1, - "exclusiveMinimum": 0 - }, - "watermark": { - "type": "boolean", - "default": "false", - "example": true - } - } - }, - "GenerateRequest": { - "type": "object", - "required": [ - "inputs" - ], - "properties": { - "inputs": { - "type": "string", - "example": "My name is Olivier and I" - }, - "parameters": { - "$ref": "#/components/schemas/GenerateParameters" - } - } - }, - "GenerateResponse": { - "type": "object", - "required": [ - "generated_text" - ], - "properties": { - "details": { - "allOf": [ - { - "$ref": "#/components/schemas/Details" - } - ], - "nullable": true - }, - "generated_text": { - "type": "string", - "example": "test" - } - } - }, - "Info": { - "type": "object", - "required": [ - "model_id", - "model_dtype", - "model_device_type", - "max_concurrent_requests", - "max_best_of", - "max_stop_sequences", - "max_input_length", - "max_total_tokens", - "waiting_served_ratio", - "max_batch_total_tokens", - "max_waiting_tokens", - "validation_workers", - "version" - ], - "properties": { - "docker_label": { - "type": "string", - "example": "null", - "nullable": true - }, - "max_batch_total_tokens": { - "type": "integer", - "format": "int32", - "example": "32000", - "minimum": 0 - }, - "max_best_of": { - "type": "integer", - "example": "2", - "minimum": 0 - }, - "max_concurrent_requests": { - "type": "integer", - "description": "Router Parameters", - "example": "128", - "minimum": 0 - }, - "max_input_length": { - "type": "integer", - "example": "1024", - "minimum": 0 - }, - "max_stop_sequences": { - "type": "integer", - "example": "4", - "minimum": 0 - }, - "max_total_tokens": { - "type": "integer", - "example": "2048", - "minimum": 0 - }, - "max_waiting_tokens": { - "type": "integer", - "example": "20", - "minimum": 0 - }, - "model_device_type": { - "type": "string", - "example": "cuda" - }, - "model_dtype": { - "type": "string", - "example": "torch.float16" - }, - "model_id": { - "type": "string", - "description": "Model info", - "example": "bigscience/blomm-560m" - }, - "model_pipeline_tag": { - "type": "string", - "example": "text-generation", - "nullable": true - }, - "model_sha": { - "type": "string", - "example": "e985a63cdc139290c5f700ff1929f0b5942cced2", - "nullable": true - }, - "sha": { - "type": "string", - "example": "null", - "nullable": true - }, - "validation_workers": { - "type": "integer", - "example": "2", - "minimum": 0 - }, - "version": { - "type": "string", - "description": "Router Info", - "example": "0.5.0" - }, - "waiting_served_ratio": { - "type": "number", - "format": "float", - "example": "1.2" - } - } - }, - "PrefillToken": { - "type": "object", - "required": [ - "id", - "text", - "logprob" - ], - "properties": { - "id": { - "type": "integer", - "format": "int32", - "example": 0, - "minimum": 0 - }, - "logprob": { - "type": "number", - "format": "float", - "example": -0.34, - "nullable": true - }, - "text": { - "type": "string", - "example": "test" - } - } - }, - "StreamDetails": { - "type": "object", - "required": [ - "finish_reason", - "generated_tokens" - ], - "properties": { - "finish_reason": { - "$ref": "#/components/schemas/FinishReason" - }, - "generated_tokens": { - "type": "integer", - "format": "int32", - "example": 1, - "minimum": 0 - }, - "seed": { - "type": "integer", - "format": "int64", - "example": 42, - "nullable": true, - "minimum": 0 - } - } - }, - "StreamResponse": { - "type": "object", - "required": [ - 
"token" - ], - "properties": { - "details": { - "allOf": [ - { - "$ref": "#/components/schemas/StreamDetails" - } - ], - "default": "null", - "nullable": true - }, - "generated_text": { - "type": "string", - "default": "null", - "example": "test", - "nullable": true - }, - "token": { - "$ref": "#/components/schemas/Token" - }, - "top_tokens": { - "type": "array", - "items": { - "$ref": "#/components/schemas/Token" - } - } - } - }, - "Token": { - "type": "object", - "required": [ - "id", - "text", - "logprob", - "special" - ], - "properties": { - "id": { - "type": "integer", - "format": "int32", - "example": 0, - "minimum": 0 - }, - "logprob": { - "type": "number", - "format": "float", - "example": -0.34, - "nullable": true - }, - "special": { - "type": "boolean", - "example": "false" - }, - "text": { - "type": "string", - "example": "test" - } - } - } - } - }, - "tags": [ - { - "name": "Text Generation Inference", - "description": "Hugging Face Text Generation Inference API" - } - ] -} +{"openapi":"3.0.3","info":{"title":"Text Generation Inference","description":"Text Generation Webserver","contact":{"name":"Olivier Dehaene"},"license":{"name":"Apache 2.0","url":"https://www.apache.org/licenses/LICENSE-2.0"},"version":"1.3.4"},"paths":{"/":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens if `stream == false` or a stream of token if `stream == true`","description":"Generate tokens if `stream == false` or a stream of token if `stream == true`","operationId":"compat_generate","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/CompatGenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateResponse"}},"text/event-stream":{"schema":{"$ref":"#/components/schemas/StreamResponse"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/generate":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens","description":"Generate tokens","operationId":"generate","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateResponse"}}}},"422":{"description":"Input validation error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is 
overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/generate_stream":{"post":{"tags":["Text Generation Inference"],"summary":"Generate a stream of token using Server-Sent Events","description":"Generate a stream of token using Server-Sent Events","operationId":"generate_stream","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/GenerateRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/StreamResponse"}}}},"422":{"description":"Input validation error","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"text/event-stream":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}},"/health":{"get":{"tags":["Text Generation Inference"],"summary":"Health check method","description":"Health check method","operationId":"health","responses":{"200":{"description":"Everything is working fine"},"503":{"description":"Text generation inference is down","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"unhealthy","error_type":"healthcheck"}}}}}}},"/info":{"get":{"tags":["Text Generation Inference"],"summary":"Text Generation Inference endpoint info","description":"Text Generation Inference endpoint info","operationId":"get_model_info","responses":{"200":{"description":"Served model info","content":{"application/json":{"schema":{"$ref":"#/components/schemas/Info"}}}}}}},"/metrics":{"get":{"tags":["Text Generation Inference"],"summary":"Prometheus metrics scrape endpoint","description":"Prometheus metrics scrape endpoint","operationId":"metrics","responses":{"200":{"description":"Prometheus Metrics","content":{"text/plain":{"schema":{"type":"string"}}}}}}},"/tokenize":{"post":{"tags":["Text Generation Inference"],"summary":"Tokenize inputs","description":"Tokenize inputs","operationId":"tokenize","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/TokenizeRequest"}}},"required":true},"responses":{"200":{"description":"Tokenized ids","content":{"application/json":{"schema":{"$ref":"#/components/schemas/TokenizeResponse"}}}},"404":{"description":"No tokenizer found","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"No fast tokenizer available"}}}}}}},"/v1/chat/completions":{"post":{"tags":["Text Generation Inference"],"summary":"Generate tokens","description":"Generate tokens","operationId":"chat_completions","requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/ChatRequest"}}},"required":true},"responses":{"200":{"description":"Generated Text","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ChatCompletionChunk"}}}},"422":{"description":"Input validation 
error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Input validation error"}}}},"424":{"description":"Generation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Request failed during generation"}}}},"429":{"description":"Model is overloaded","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Model is overloaded"}}}},"500":{"description":"Incomplete generation","content":{"application/json":{"schema":{"$ref":"#/components/schemas/ErrorResponse"},"example":{"error":"Incomplete generation"}}}}}}}},"components":{"schemas":{"BestOfSequence":{"type":"object","required":["generated_text","finish_reason","generated_tokens","prefill","tokens"],"properties":{"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_text":{"type":"string","example":"test"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"prefill":{"type":"array","items":{"$ref":"#/components/schemas/PrefillToken"}},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0},"tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}},"top_tokens":{"type":"array","items":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}}},"CompatGenerateRequest":{"type":"object","required":["inputs"],"properties":{"inputs":{"type":"string","example":"My name is Olivier and I"},"parameters":{"$ref":"#/components/schemas/GenerateParameters"},"stream":{"type":"boolean","default":"false"}}},"Details":{"type":"object","required":["finish_reason","generated_tokens","prefill","tokens"],"properties":{"best_of_sequences":{"type":"array","items":{"$ref":"#/components/schemas/BestOfSequence"},"nullable":true},"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"prefill":{"type":"array","items":{"$ref":"#/components/schemas/PrefillToken"}},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0},"tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}},"top_tokens":{"type":"array","items":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}}},"ErrorResponse":{"type":"object","required":["error","error_type"],"properties":{"error":{"type":"string"},"error_type":{"type":"string"}}},"FinishReason":{"type":"string","enum":["length","eos_token","stop_sequence"]},"GenerateParameters":{"type":"object","properties":{"best_of":{"type":"integer","default":"null","example":1,"nullable":true,"minimum":0,"exclusiveMinimum":0},"decoder_input_details":{"type":"boolean","default":"true"},"details":{"type":"boolean","default":"true"},"do_sample":{"type":"boolean","default":"false","example":true},"max_new_tokens":{"type":"integer","format":"int32","default":"100","example":"20","nullable":true,"minimum":0},"repetition_penalty":{"type":"number","format":"float","default":"null","example":1.03,"nullable":true,"exclusiveMinimum":0},"return_full_text":{"type":"boolean","default":"null","example":false,"nullable":true},"seed":{"type":"integer","format":"int64","default":"null","example":"null","nullable":true,"minimum":0,"exclusiveMinimum":0},"stop":{"type":"array","items":{"type":"string"},"example":["photographer"],"maxItems":4},"temperature":{"type":"number","format":"float","default":"null","example":0.5,"nullable":true,"exclusiveMinimum":0},"top_k":{"type":"integer","format
":"int32","default":"null","example":10,"nullable":true,"exclusiveMinimum":0},"top_n_tokens":{"type":"integer","format":"int32","default":"null","example":5,"nullable":true,"minimum":0,"exclusiveMinimum":0},"top_p":{"type":"number","format":"float","default":"null","example":0.95,"nullable":true,"maximum":1,"exclusiveMinimum":0},"truncate":{"type":"integer","default":"null","example":"null","nullable":true,"minimum":0},"typical_p":{"type":"number","format":"float","default":"null","example":0.95,"nullable":true,"maximum":1,"exclusiveMinimum":0},"watermark":{"type":"boolean","default":"false","example":true}}},"GenerateRequest":{"type":"object","required":["inputs"],"properties":{"inputs":{"type":"string","example":"My name is Olivier and I"},"parameters":{"$ref":"#/components/schemas/GenerateParameters"}}},"GenerateResponse":{"type":"object","required":["generated_text"],"properties":{"details":{"allOf":[{"$ref":"#/components/schemas/Details"}],"nullable":true},"generated_text":{"type":"string","example":"test"}}},"Info":{"type":"object","required":["model_id","model_dtype","model_device_type","max_concurrent_requests","max_best_of","max_stop_sequences","max_input_length","max_total_tokens","waiting_served_ratio","max_batch_total_tokens","max_waiting_tokens","validation_workers","version"],"properties":{"docker_label":{"type":"string","example":"null","nullable":true},"max_batch_total_tokens":{"type":"integer","format":"int32","example":"32000","minimum":0},"max_best_of":{"type":"integer","example":"2","minimum":0},"max_concurrent_requests":{"type":"integer","description":"Router Parameters","example":"128","minimum":0},"max_input_length":{"type":"integer","example":"1024","minimum":0},"max_stop_sequences":{"type":"integer","example":"4","minimum":0},"max_total_tokens":{"type":"integer","example":"2048","minimum":0},"max_waiting_tokens":{"type":"integer","example":"20","minimum":0},"model_device_type":{"type":"string","example":"cuda"},"model_dtype":{"type":"string","example":"torch.float16"},"model_id":{"type":"string","description":"Model info","example":"bigscience/blomm-560m"},"model_pipeline_tag":{"type":"string","example":"text-generation","nullable":true},"model_sha":{"type":"string","example":"e985a63cdc139290c5f700ff1929f0b5942cced2","nullable":true},"sha":{"type":"string","example":"null","nullable":true},"validation_workers":{"type":"integer","example":"2","minimum":0},"version":{"type":"string","description":"Router 
Info","example":"0.5.0"},"waiting_served_ratio":{"type":"number","format":"float","example":"1.2"}}},"PrefillToken":{"type":"object","required":["id","text","logprob"],"properties":{"id":{"type":"integer","format":"int32","example":0,"minimum":0},"logprob":{"type":"number","format":"float","example":-0.34,"nullable":true},"text":{"type":"string","example":"test"}}},"StreamDetails":{"type":"object","required":["finish_reason","generated_tokens"],"properties":{"finish_reason":{"$ref":"#/components/schemas/FinishReason"},"generated_tokens":{"type":"integer","format":"int32","example":1,"minimum":0},"seed":{"type":"integer","format":"int64","example":42,"nullable":true,"minimum":0}}},"StreamResponse":{"type":"object","required":["index","token"],"properties":{"details":{"allOf":[{"$ref":"#/components/schemas/StreamDetails"}],"default":"null","nullable":true},"generated_text":{"type":"string","default":"null","example":"test","nullable":true},"index":{"type":"integer","format":"int32","minimum":0},"token":{"$ref":"#/components/schemas/Token"},"top_tokens":{"type":"array","items":{"$ref":"#/components/schemas/Token"}}}},"Token":{"type":"object","required":["id","text","logprob","special"],"properties":{"id":{"type":"integer","format":"int32","example":0,"minimum":0},"logprob":{"type":"number","format":"float","example":-0.34,"nullable":true},"special":{"type":"boolean","example":"false"},"text":{"type":"string","example":"test"}}}}},"tags":[{"name":"Text Generation Inference","description":"Hugging Face Text Generation Inference API"}]} diff --git a/router/src/infer.rs b/router/src/infer.rs index 8a9875eb..5f078ba0 100644 --- a/router/src/infer.rs +++ b/router/src/infer.rs @@ -165,6 +165,28 @@ impl Infer { )) } + /// Tokenizer the input + #[instrument(skip_all)] + pub(crate) async fn tokenize( + &self, + request: GenerateRequest, + ) -> Result, InferError> { + // Tokenize request + let inputs = request.inputs; + let truncate = request.parameters.truncate; + let encoding = self + .validation + .tokenize(inputs, truncate) + .await + .map_err(|err| { + tracing::error!("Tokenization {err}"); + err + })?; + + // Return Encoding + Ok(encoding.map(|(encoding, _)| encoding)) + } + /// Apply the chat template to the chat request #[instrument(skip_all)] pub(crate) fn apply_chat_template(&self, messages: Vec) -> Result { diff --git a/router/src/lib.rs b/router/src/lib.rs index 894ab466..2bfbbacd 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -444,6 +444,18 @@ pub struct Token { special: bool, } +#[derive(Debug, Serialize, ToSchema)] +pub struct SimpleToken { + #[schema(example = 0)] + id: u32, + #[schema(example = "test")] + text: String, + #[schema(example = 0)] + start: usize, + #[schema(example = 2)] + stop: usize, +} + #[derive(Serialize, ToSchema)] #[serde(rename_all(serialize = "snake_case"))] pub(crate) enum FinishReason { diff --git a/router/src/server.rs b/router/src/server.rs index ff48b4f0..c5ca4665 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -5,8 +5,8 @@ use crate::validation::ValidationError; use crate::{ BestOfSequence, ChatCompletion, ChatCompletionChunk, ChatRequest, CompatGenerateRequest, Details, ErrorResponse, FinishReason, GenerateParameters, GenerateRequest, GenerateResponse, - HubModelInfo, HubTokenizerConfig, Infer, Info, PrefillToken, StreamDetails, StreamResponse, - Token, Validation, + HubModelInfo, HubTokenizerConfig, Infer, Info, PrefillToken, SimpleToken, StreamDetails, + StreamResponse, Token, Validation, }; use axum::extract::Extension; use 
@@ -532,7 +532,7 @@ async fn generate_stream_internal(
     path = "/v1/chat/completions",
     request_body = ChatRequest,
     responses(
-    (status = 200, description = "Generated Text", body = GenerateResponse),
+    (status = 200, description = "Generated Text", body = ChatCompletionChunk),
     (status = 424, description = "Generation Error", body = ErrorResponse,
     example = json ! ({"error": "Request failed during generation"})),
     (status = 429, description = "Model is overloaded", body = ErrorResponse,
@@ -672,6 +672,52 @@ async fn chat_completions(
     }
 }
 
+/// Tokenize inputs
+#[utoipa::path(
+    post,
+    tag = "Text Generation Inference",
+    path = "/tokenize",
+    request_body = TokenizeRequest,
+    responses(
+    (status = 200, description = "Tokenized ids", body = TokenizeResponse),
+    (status = 404, description = "No tokenizer found", body = ErrorResponse,
+    example = json ! ({"error": "No fast tokenizer available"})),
+    )
+    )]
+#[instrument(skip_all)]
+async fn tokenize(
+    Extension(infer): Extension<Infer>,
+    Json(req): Json<GenerateRequest>,
+) -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
+    let input = req.inputs.clone();
+    let encoding = infer.tokenize(req).await?;
+    if let Some(encoding) = encoding {
+        let tokens: Vec<SimpleToken> = encoding
+            .get_ids()
+            .iter()
+            .zip(encoding.get_offsets())
+            .map(|(&id, &(start, stop))| {
+                let text: String = input.chars().skip(start).take(stop - start).collect();
+                SimpleToken {
+                    id,
+                    text,
+                    start,
+                    stop,
+                }
+            })
+            .collect();
+        Ok(Json(tokens).into_response())
+    } else {
+        Err((
+            StatusCode::NOT_FOUND,
+            Json(ErrorResponse {
+                error: "No fast tokenizer or tokenizer.json for this model".to_string(),
+                error_type: "no fast tokenizer".to_string(),
+            }),
+        ))
+    }
+}
+
 /// Prometheus metrics scrape endpoint
 #[utoipa::path(
     get,
@@ -719,6 +765,8 @@ pub async fn run(
     compat_generate,
     generate,
     generate_stream,
+    chat_completions,
+    tokenize,
     metrics,
     ),
     components(
@@ -867,6 +915,7 @@ pub async fn run(
         .route("/generate", post(generate))
         .route("/generate_stream", post(generate_stream))
         .route("/v1/chat/completions", post(chat_completions))
+        .route("/tokenize", post(tokenize))
         .route("/health", get(health))
         .route("/ping", get(health))
         .route("/metrics", get(metrics));
diff --git a/router/src/validation.rs b/router/src/validation.rs
index 370e9588..750b98e5 100644
--- a/router/src/validation.rs
+++ b/router/src/validation.rs
@@ -70,12 +70,11 @@ impl Validation {
     }
 
     #[instrument(skip(self, inputs))]
-    async fn validate_input(
+    pub async fn tokenize(
         &self,
         inputs: String,
         truncate: Option<usize>,
-        max_new_tokens: Option<u32>,
-    ) -> Result<(String, usize, u32), ValidationError> {
+    ) -> Result<Option<(tokenizers::Encoding, String)>, ValidationError> {
         // If we have a fast tokenizer
         if let Some(sender) = &self.sender {
             // Create response channel
@@ -88,7 +87,24 @@ impl Validation {
 
             // Await on response channel
             // Unwrap is safe here
-            let (inputs, input_length) = response_receiver.await.unwrap()?;
+            let encoding = response_receiver.await.unwrap()?;
+            Ok(Some(encoding))
+        } else {
+            Ok(None)
+        }
+    }
+
+    #[instrument(skip(self, inputs))]
+    async fn validate_input(
+        &self,
+        inputs: String,
+        truncate: Option<usize>,
+        max_new_tokens: Option<u32>,
+    ) -> Result<(String, usize, u32), ValidationError> {
+        // If we have a fast tokenizer
+        if let Some((encoding, inputs)) = self.tokenize(inputs.clone(), truncate).await? {
+            // Create response channel
+            let input_length = encoding.len();
 
             // Get total tokens
             let max_new_tokens: u32 = if let Some(max_new_tokens) = max_new_tokens {
@@ -343,36 +359,31 @@ fn tokenizer_worker(tokenizer: Tokenizer, mut receiver: mpsc::UnboundedReceiver<TokenizerRequest>) {
 }
 
 /// Get input length and optionally truncate it
 fn prepare_input(
-    inputs: String,
+    mut inputs: String,
     truncate: Option<usize>,
     tokenizer: &Tokenizer,
-) -> Result<(String, usize), ValidationError> {
+) -> Result<(tokenizers::Encoding, String), ValidationError> {
     // Get the number of tokens in the input
     let mut encoding = tokenizer
         .encode(inputs.clone(), true)
         .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
 
     // Optionally truncate
-    let (inputs, input_length) = match truncate {
-        // Truncate is some and < encoding length
-        Some(truncate) if truncate < encoding.len() => {
-            // truncate encoding and decode new inputs
+    if let Some(truncate) = truncate {
+        if truncate < encoding.len() {
             encoding.truncate(truncate, 0, TruncationDirection::Left);
-            let inputs = tokenizer
+            inputs = tokenizer
                 .decode(encoding.get_ids(), false)
                 .map_err(|err| ValidationError::Tokenizer(err.to_string()))?;
-            (inputs, encoding.len())
         }
-        // Nothing to do
-        _ => (inputs, encoding.len()),
-    };
+    }
 
-    Ok((inputs, input_length))
+    Ok((encoding, inputs))
 }
 
 type TokenizerRequest = (
     (String, Option<usize>),
-    oneshot::Sender<Result<(String, usize), ValidationError>>,
+    oneshot::Sender<Result<(tokenizers::Encoding, String), ValidationError>>,
     Span,
 );
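
For reference, the new `/tokenize` route accepts the same JSON body as `/generate` (only `inputs` is required) and returns one `SimpleToken` per token, with `start`/`stop` offsets into the original input; when no fast tokenizer is loaded it answers 404. Below is a minimal client sketch, assuming a router listening on `http://localhost:3000` and the `reqwest` (with the `json` feature), `serde`, `serde_json`, and `tokio` crates; none of this is part of the diff itself.

```rust
use serde::Deserialize;

// Mirrors the `SimpleToken` schema added to router/src/lib.rs.
#[derive(Debug, Deserialize)]
struct SimpleToken {
    id: u32,
    text: String,
    start: usize,
    stop: usize,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Same request shape as /generate; only `inputs` is required.
    let body = serde_json::json!({ "inputs": "My name is Olivier and I" });

    let tokens: Vec<SimpleToken> = reqwest::Client::new()
        .post("http://localhost:3000/tokenize")
        .json(&body)
        .send()
        .await?
        // A 404 here means the model has no fast tokenizer / tokenizer.json.
        .error_for_status()?
        .json()
        .await?;

    // `start`/`stop` are the offsets the router uses to slice the original
    // `inputs` (it builds `text` with `chars().skip(start).take(stop - start)`).
    for t in &tokens {
        println!("{:>6}  {:?}  [{}..{}]", t.id, t.text, t.start, t.stop);
    }
    Ok(())
}
```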