hf_text-generation-inference/docs/openapi.json

1375 lines
37 KiB
JSON

{
"openapi": "3.0.3",
"info": {
"title": "Text Generation Inference",
"description": "Text Generation Webserver",
"contact": {
"name": "Olivier Dehaene"
},
"license": {
"name": "Apache 2.0",
"url": "https://www.apache.org/licenses/LICENSE-2.0"
},
"version": "1.4.1"
},
"paths": {
"/": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
"description": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
"operationId": "compat_generate",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CompatGenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateResponse"
}
},
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/StreamResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/generate": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens",
"description": "Generate tokens",
"operationId": "generate",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/generate_stream": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate a stream of token using Server-Sent Events",
"description": "Generate a stream of token using Server-Sent Events",
"operationId": "generate_stream",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/StreamResponse"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"text/event-stream": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
},
"/health": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Health check method",
"description": "Health check method",
"operationId": "health",
"responses": {
"200": {
"description": "Everything is working fine"
},
"503": {
"description": "Text generation inference is down",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "unhealthy",
"error_type": "healthcheck"
}
}
}
}
}
}
},
"/info": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Text Generation Inference endpoint info",
"description": "Text Generation Inference endpoint info",
"operationId": "get_model_info",
"responses": {
"200": {
"description": "Served model info",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Info"
}
}
}
}
}
}
},
"/metrics": {
"get": {
"tags": [
"Text Generation Inference"
],
"summary": "Prometheus metrics scrape endpoint",
"description": "Prometheus metrics scrape endpoint",
"operationId": "metrics",
"responses": {
"200": {
"description": "Prometheus Metrics",
"content": {
"text/plain": {
"schema": {
"type": "string"
}
}
}
}
}
}
},
"/tokenize": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Tokenize inputs",
"description": "Tokenize inputs",
"operationId": "tokenize",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/GenerateRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Tokenized ids",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/TokenizeResponse"
}
}
}
},
"404": {
"description": "No tokenizer found",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "No fast tokenizer available"
}
}
}
}
}
}
},
"/v1/chat/completions": {
"post": {
"tags": [
"Text Generation Inference"
],
"summary": "Generate tokens",
"description": "Generate tokens",
"operationId": "chat_completions",
"requestBody": {
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatRequest"
}
}
},
"required": true
},
"responses": {
"200": {
"description": "Generated Text",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ChatCompletionChunk"
}
}
}
},
"422": {
"description": "Input validation error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Input validation error"
}
}
}
},
"424": {
"description": "Generation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Request failed during generation"
}
}
}
},
"429": {
"description": "Model is overloaded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Model is overloaded"
}
}
}
},
"500": {
"description": "Incomplete generation",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ErrorResponse"
},
"example": {
"error": "Incomplete generation"
}
}
}
}
}
}
}
},
"components": {
"schemas": {
"BestOfSequence": {
"type": "object",
"required": [
"generated_text",
"finish_reason",
"generated_tokens",
"prefill",
"tokens"
],
"properties": {
"finish_reason": {
"$ref": "#/components/schemas/FinishReason"
},
"generated_text": {
"type": "string",
"example": "test"
},
"generated_tokens": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"prefill": {
"type": "array",
"items": {
"$ref": "#/components/schemas/PrefillToken"
}
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"tokens": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
},
"top_tokens": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
}
}
}
},
"ChatCompletion": {
"type": "object",
"required": [
"id",
"object",
"created",
"model",
"system_fingerprint",
"choices",
"usage"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionComplete"
}
},
"created": {
"type": "integer",
"format": "int64",
"example": "1706270835",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"object": {
"type": "string"
},
"system_fingerprint": {
"type": "string"
},
"usage": {
"$ref": "#/components/schemas/Usage"
}
}
},
"ChatCompletionChoice": {
"type": "object",
"required": [
"index",
"delta"
],
"properties": {
"delta": {
"$ref": "#/components/schemas/ChatCompletionDelta"
},
"finish_reason": {
"type": "string",
"nullable": true
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"logprobs": {
"allOf": [
{
"$ref": "#/components/schemas/ChatCompletionLogprobs"
}
],
"nullable": true
}
}
},
"ChatCompletionChunk": {
"type": "object",
"required": [
"id",
"object",
"created",
"model",
"system_fingerprint",
"choices"
],
"properties": {
"choices": {
"type": "array",
"items": {
"$ref": "#/components/schemas/ChatCompletionChoice"
}
},
"created": {
"type": "integer",
"format": "int64",
"example": "1706270978",
"minimum": 0
},
"id": {
"type": "string"
},
"model": {
"type": "string",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"object": {
"type": "string"
},
"system_fingerprint": {
"type": "string"
}
}
},
"ChatCompletionDelta": {
"type": "object",
"required": [
"role",
"content"
],
"properties": {
"content": {
"type": "string",
"example": "What is Deep Learning?"
},
"role": {
"type": "string",
"example": "user"
}
}
},
"ChatRequest": {
"type": "object",
"required": [
"model"
],
"properties": {
"frequency_penalty": {
"type": "number",
"format": "float",
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
"example": "1.0",
"nullable": true
},
"logit_bias": {
"type": "array",
"items": {
"type": "number",
"format": "float"
},
"description": "UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.",
"nullable": true
},
"logprobs": {
"type": "boolean",
"description": "Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each\noutput token returned in the content of message.",
"example": "false",
"nullable": true
},
"max_tokens": {
"type": "integer",
"format": "int32",
"description": "The maximum number of tokens that can be generated in the chat completion.",
"example": "32",
"nullable": true,
"minimum": 0
},
"messages": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Message"
},
"description": "A list of messages comprising the conversation so far."
},
"model": {
"type": "string",
"description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
"example": "mistralai/Mistral-7B-Instruct-v0.2"
},
"n": {
"type": "integer",
"format": "int32",
"description": "UNUSED\nHow many chat completion choices to generate for each input message. Note that you will be charged based on the\nnumber of generated tokens across all of the choices. Keep n as 1 to minimize costs.",
"example": "2",
"nullable": true,
"minimum": 0
},
"presence_penalty": {
"type": "number",
"format": "float",
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics",
"example": 0.1,
"nullable": true
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"stream": {
"type": "boolean"
},
"temperature": {
"type": "number",
"format": "float",
"description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.",
"example": 1.0,
"nullable": true
},
"top_logprobs": {
"type": "integer",
"format": "int32",
"description": "An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.",
"example": "5",
"nullable": true,
"minimum": 0
},
"top_p": {
"type": "number",
"format": "float",
"description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
"example": 0.95,
"nullable": true
}
}
},
"CompatGenerateRequest": {
"type": "object",
"required": [
"inputs"
],
"properties": {
"inputs": {
"type": "string",
"example": "My name is Olivier and I"
},
"parameters": {
"$ref": "#/components/schemas/GenerateParameters"
},
"stream": {
"type": "boolean",
"default": "false"
}
}
},
"Details": {
"type": "object",
"required": [
"finish_reason",
"generated_tokens",
"prefill",
"tokens"
],
"properties": {
"best_of_sequences": {
"type": "array",
"items": {
"$ref": "#/components/schemas/BestOfSequence"
},
"nullable": true
},
"finish_reason": {
"$ref": "#/components/schemas/FinishReason"
},
"generated_tokens": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"prefill": {
"type": "array",
"items": {
"$ref": "#/components/schemas/PrefillToken"
}
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
},
"tokens": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
},
"top_tokens": {
"type": "array",
"items": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
}
}
}
},
"ErrorResponse": {
"type": "object",
"required": [
"error",
"error_type"
],
"properties": {
"error": {
"type": "string"
},
"error_type": {
"type": "string"
}
}
},
"FinishReason": {
"type": "string",
"enum": [
"length",
"eos_token",
"stop_sequence"
],
"example": "Length"
},
"GenerateParameters": {
"type": "object",
"properties": {
"best_of": {
"type": "integer",
"default": "null",
"example": 1,
"nullable": true,
"minimum": 0,
"exclusiveMinimum": 0
},
"decoder_input_details": {
"type": "boolean",
"default": "true"
},
"details": {
"type": "boolean",
"default": "true"
},
"do_sample": {
"type": "boolean",
"default": "false",
"example": true
},
"frequency_penalty": {
"type": "number",
"format": "float",
"default": "null",
"example": 0.1,
"nullable": true,
"exclusiveMinimum": -2
},
"grammar": {
"allOf": [
{
"$ref": "#/components/schemas/GrammarType"
}
],
"nullable": true
},
"max_new_tokens": {
"type": "integer",
"format": "int32",
"default": "100",
"example": "20",
"nullable": true,
"minimum": 0
},
"repetition_penalty": {
"type": "number",
"format": "float",
"default": "null",
"example": 1.03,
"nullable": true,
"exclusiveMinimum": 0
},
"return_full_text": {
"type": "boolean",
"default": "null",
"example": false,
"nullable": true
},
"seed": {
"type": "integer",
"format": "int64",
"default": "null",
"example": "null",
"nullable": true,
"minimum": 0,
"exclusiveMinimum": 0
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"example": [
"photographer"
],
"maxItems": 4
},
"temperature": {
"type": "number",
"format": "float",
"default": "null",
"example": 0.5,
"nullable": true,
"exclusiveMinimum": 0
},
"top_k": {
"type": "integer",
"format": "int32",
"default": "null",
"example": 10,
"nullable": true,
"exclusiveMinimum": 0
},
"top_n_tokens": {
"type": "integer",
"format": "int32",
"default": "null",
"example": 5,
"nullable": true,
"minimum": 0,
"exclusiveMinimum": 0
},
"top_p": {
"type": "number",
"format": "float",
"default": "null",
"example": 0.95,
"nullable": true,
"maximum": 1,
"exclusiveMinimum": 0
},
"truncate": {
"type": "integer",
"default": "null",
"example": "null",
"nullable": true,
"minimum": 0
},
"typical_p": {
"type": "number",
"format": "float",
"default": "null",
"example": 0.95,
"nullable": true,
"maximum": 1,
"exclusiveMinimum": 0
},
"watermark": {
"type": "boolean",
"default": "false",
"example": true
}
}
},
"GenerateRequest": {
"type": "object",
"required": [
"inputs"
],
"properties": {
"inputs": {
"type": "string",
"example": "My name is Olivier and I"
},
"parameters": {
"$ref": "#/components/schemas/GenerateParameters"
}
}
},
"GenerateResponse": {
"type": "object",
"required": [
"generated_text"
],
"properties": {
"details": {
"allOf": [
{
"$ref": "#/components/schemas/Details"
}
],
"nullable": true
},
"generated_text": {
"type": "string",
"example": "test"
}
}
},
"GrammarType": {
"oneOf": [
{
"type": "object",
"required": [
"type",
"value"
],
"properties": {
"type": {
"type": "string",
"enum": [
"json"
]
},
"value": {
"type": "string",
"description": "A string that represents a [JSON Schema](https://json-schema.org/).\n\nJSON Schema is a declarative language that allows to annotate JSON documents\nwith types and descriptions.",
"example": {
"properties": {
"location": {
"type": "string"
}
}
}
}
}
},
{
"type": "object",
"required": [
"type",
"value"
],
"properties": {
"type": {
"type": "string",
"enum": [
"regex"
]
},
"value": {
"type": "string"
}
}
}
],
"discriminator": {
"propertyName": "type"
}
},
"Info": {
"type": "object",
"required": [
"model_id",
"model_dtype",
"model_device_type",
"max_concurrent_requests",
"max_best_of",
"max_stop_sequences",
"max_input_length",
"max_total_tokens",
"waiting_served_ratio",
"max_batch_total_tokens",
"max_waiting_tokens",
"validation_workers",
"version"
],
"properties": {
"docker_label": {
"type": "string",
"example": "null",
"nullable": true
},
"max_batch_size": {
"type": "integer",
"example": "null",
"nullable": true,
"minimum": 0
},
"max_batch_total_tokens": {
"type": "integer",
"format": "int32",
"example": "32000",
"minimum": 0
},
"max_best_of": {
"type": "integer",
"example": "2",
"minimum": 0
},
"max_concurrent_requests": {
"type": "integer",
"description": "Router Parameters",
"example": "128",
"minimum": 0
},
"max_input_length": {
"type": "integer",
"example": "1024",
"minimum": 0
},
"max_stop_sequences": {
"type": "integer",
"example": "4",
"minimum": 0
},
"max_total_tokens": {
"type": "integer",
"example": "2048",
"minimum": 0
},
"max_waiting_tokens": {
"type": "integer",
"example": "20",
"minimum": 0
},
"model_device_type": {
"type": "string",
"example": "cuda"
},
"model_dtype": {
"type": "string",
"example": "torch.float16"
},
"model_id": {
"type": "string",
"description": "Model info",
"example": "bigscience/blomm-560m"
},
"model_pipeline_tag": {
"type": "string",
"example": "text-generation",
"nullable": true
},
"model_sha": {
"type": "string",
"example": "e985a63cdc139290c5f700ff1929f0b5942cced2",
"nullable": true
},
"sha": {
"type": "string",
"example": "null",
"nullable": true
},
"validation_workers": {
"type": "integer",
"example": "2",
"minimum": 0
},
"version": {
"type": "string",
"description": "Router Info",
"example": "0.5.0"
},
"waiting_served_ratio": {
"type": "number",
"format": "float",
"example": "1.2"
}
}
},
"Message": {
"type": "object",
"required": [
"role",
"content"
],
"properties": {
"content": {
"type": "string",
"example": "My name is David and I"
},
"name": {
"type": "string",
"example": "\"David\"",
"nullable": true
},
"role": {
"type": "string",
"example": "user"
}
}
},
"PrefillToken": {
"type": "object",
"required": [
"id",
"text",
"logprob"
],
"properties": {
"id": {
"type": "integer",
"format": "int32",
"example": 0,
"minimum": 0
},
"logprob": {
"type": "number",
"format": "float",
"example": -0.34,
"nullable": true
},
"text": {
"type": "string",
"example": "test"
}
}
},
"SimpleToken": {
"type": "object",
"required": [
"id",
"text",
"start",
"stop"
],
"properties": {
"id": {
"type": "integer",
"format": "int32",
"example": 0,
"minimum": 0
},
"start": {
"type": "integer",
"example": 0,
"minimum": 0
},
"stop": {
"type": "integer",
"example": 2,
"minimum": 0
},
"text": {
"type": "string",
"example": "test"
}
}
},
"StreamDetails": {
"type": "object",
"required": [
"finish_reason",
"generated_tokens"
],
"properties": {
"finish_reason": {
"$ref": "#/components/schemas/FinishReason"
},
"generated_tokens": {
"type": "integer",
"format": "int32",
"example": 1,
"minimum": 0
},
"seed": {
"type": "integer",
"format": "int64",
"example": 42,
"nullable": true,
"minimum": 0
}
}
},
"StreamResponse": {
"type": "object",
"required": [
"index",
"token"
],
"properties": {
"details": {
"allOf": [
{
"$ref": "#/components/schemas/StreamDetails"
}
],
"default": "null",
"nullable": true
},
"generated_text": {
"type": "string",
"default": "null",
"example": "test",
"nullable": true
},
"index": {
"type": "integer",
"format": "int32",
"minimum": 0
},
"token": {
"$ref": "#/components/schemas/Token"
},
"top_tokens": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Token"
}
}
}
},
"Token": {
"type": "object",
"required": [
"id",
"text",
"logprob",
"special"
],
"properties": {
"id": {
"type": "integer",
"format": "int32",
"example": 0,
"minimum": 0
},
"logprob": {
"type": "number",
"format": "float",
"example": -0.34,
"nullable": true
},
"special": {
"type": "boolean",
"example": "false"
},
"text": {
"type": "string",
"example": "test"
}
}
},
"TokenizeResponse": {
"type": "array",
"items": {
"$ref": "#/components/schemas/SimpleToken"
}
}
}
},
"tags": [
{
"name": "Text Generation Inference",
"description": "Hugging Face Text Generation Inference API"
}
]
}