hf_text-generation-inference/docs/openapi.json

{
  "openapi": "3.0.3",
  "info": {
    "title": "Text Generation Inference",
    "description": "Text Generation Webserver",
    "contact": {
      "name": "Olivier Dehaene"
    },
    "license": {
      "name": "Apache 2.0",
      "url": "https://www.apache.org/licenses/LICENSE-2.0"
    },
    "version": "1.4.3"
  },
  "paths": {
    "/": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
        "description": "Generate tokens if `stream == false` or a stream of token if `stream == true`",
        "operationId": "compat_generate",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/CompatGenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/GenerateResponse"
                }
              },
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/StreamResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation"
                }
              }
            }
          }
        }
      }
    },
    "/generate": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens",
        "description": "Generate tokens",
        "operationId": "generate",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/GenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/GenerateResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation"
                }
              }
            }
          }
        }
      }
    },
    "/generate_stream": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate a stream of token using Server-Sent Events",
        "description": "Generate a stream of token using Server-Sent Events",
        "operationId": "generate_stream",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/GenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/StreamResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation"
                }
              }
            }
          }
        }
      }
    },
    "/health": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Health check method",
        "description": "Health check method",
        "operationId": "health",
        "responses": {
          "200": {
            "description": "Everything is working fine"
          },
          "503": {
            "description": "Text generation inference is down",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "unhealthy",
                  "error_type": "healthcheck"
                }
              }
            }
          }
        }
      }
    },
    "/info": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Text Generation Inference endpoint info",
        "description": "Text Generation Inference endpoint info",
        "operationId": "get_model_info",
        "responses": {
          "200": {
            "description": "Served model info",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/Info"
                }
              }
            }
          }
        }
      }
    },
    "/metrics": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Prometheus metrics scrape endpoint",
        "description": "Prometheus metrics scrape endpoint",
        "operationId": "metrics",
        "responses": {
          "200": {
            "description": "Prometheus Metrics",
            "content": {
              "text/plain": {
                "schema": {
                  "type": "string"
                }
              }
            }
          }
        }
      }
    },
    "/tokenize": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Tokenize inputs",
        "description": "Tokenize inputs",
        "operationId": "tokenize",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/GenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Tokenized ids",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/TokenizeResponse"
                }
              }
            }
          },
          "404": {
            "description": "No tokenizer found",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "No fast tokenizer available"
                }
              }
            }
          }
        }
      }
    },
    "/v1/chat/completions": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens",
        "description": "Generate tokens",
        "operationId": "chat_completions",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/ChatRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ChatCompletionChunk"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation"
                }
              }
            }
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "BestOfSequence": {
        "type": "object",
        "required": [
          "generated_text",
          "finish_reason",
          "generated_tokens",
          "prefill",
          "tokens"
        ],
        "properties": {
          "finish_reason": {
            "$ref": "#/components/schemas/FinishReason"
          },
          "generated_text": {
            "type": "string",
            "example": "test"
          },
          "generated_tokens": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "prefill": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/PrefillToken"
            }
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          },
          "tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "type": "array",
              "items": {
                "$ref": "#/components/schemas/Token"
              }
            }
          }
        }
      },
      "ChatCompletion": {
        "type": "object",
        "required": [
          "id",
          "object",
          "created",
          "model",
          "system_fingerprint",
          "choices",
          "usage"
        ],
        "properties": {
          "choices": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ChatCompletionComplete"
            }
          },
          "created": {
            "type": "integer",
            "format": "int64",
            "example": "1706270835",
            "minimum": 0
          },
          "id": {
            "type": "string"
          },
          "model": {
            "type": "string",
            "example": "mistralai/Mistral-7B-Instruct-v0.2"
          },
          "object": {
            "type": "string"
          },
          "system_fingerprint": {
            "type": "string"
          },
          "usage": {
            "$ref": "#/components/schemas/Usage"
          }
        }
      },
      "ChatCompletionChoice": {
        "type": "object",
        "required": [
          "index",
          "delta"
        ],
        "properties": {
          "delta": {
            "$ref": "#/components/schemas/ChatCompletionDelta"
          },
          "finish_reason": {
            "type": "string",
            "nullable": true
          },
          "index": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "logprobs": {
            "allOf": [
              {
                "$ref": "#/components/schemas/ChatCompletionLogprobs"
              }
            ],
            "nullable": true
          }
        }
      },
      "ChatCompletionChunk": {
        "type": "object",
        "required": [
          "id",
          "object",
          "created",
          "model",
          "system_fingerprint",
          "choices"
        ],
        "properties": {
          "choices": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ChatCompletionChoice"
            }
          },
          "created": {
            "type": "integer",
            "format": "int64",
            "example": "1706270978",
            "minimum": 0
          },
          "id": {
            "type": "string"
          },
          "model": {
            "type": "string",
            "example": "mistralai/Mistral-7B-Instruct-v0.2"
          },
          "object": {
            "type": "string"
          },
          "system_fingerprint": {
            "type": "string"
          }
        }
      },
      "ChatCompletionComplete": {
        "type": "object",
        "required": [
          "index",
          "message",
          "finish_reason"
        ],
        "properties": {
          "finish_reason": {
            "type": "string"
          },
          "index": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "logprobs": {
            "allOf": [
              {
                "$ref": "#/components/schemas/ChatCompletionLogprobs"
              }
            ],
            "nullable": true
          },
          "message": {
            "$ref": "#/components/schemas/Message"
          }
        }
      },
      "ChatCompletionDelta": {
        "type": "object",
        "required": [
          "role",
          "content"
        ],
        "properties": {
          "content": {
            "type": "string",
            "example": "What is Deep Learning?"
          },
          "role": {
            "type": "string",
            "example": "user"
          }
        }
      },
      "ChatCompletionLogprob": {
        "type": "object",
        "required": [
          "token",
          "logprob",
          "top_logprobs"
        ],
        "properties": {
          "logprob": {
            "type": "number",
            "format": "float"
          },
          "token": {
            "type": "string"
          },
          "top_logprobs": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ChatCompletionTopLogprob"
            }
          }
        }
      },
      "ChatCompletionLogprobs": {
        "type": "object",
        "required": [
          "content"
        ],
        "properties": {
          "content": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/ChatCompletionLogprob"
            }
          }
        }
      },
      "ChatCompletionTopLogprob": {
        "type": "object",
        "required": [
          "token",
          "logprob"
        ],
        "properties": {
          "logprob": {
            "type": "number",
            "format": "float"
          },
          "token": {
            "type": "string"
          }
        }
      },
      "ChatRequest": {
        "type": "object",
        "required": [
          "model"
        ],
        "properties": {
          "frequency_penalty": {
            "type": "number",
            "format": "float",
            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim.",
            "example": "1.0",
            "nullable": true
          },
          "logit_bias": {
            "type": "array",
            "items": {
              "type": "number",
              "format": "float"
            },
            "description": "UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token.",
            "nullable": true
          },
          "logprobs": {
            "type": "boolean",
            "description": "Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each\noutput token returned in the content of message.",
            "example": "false",
            "nullable": true
          },
          "max_tokens": {
            "type": "integer",
            "format": "int32",
            "description": "The maximum number of tokens that can be generated in the chat completion.",
            "example": "32",
            "nullable": true,
            "minimum": 0
          },
          "messages": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Message"
            },
            "description": "A list of messages comprising the conversation so far."
          },
          "model": {
            "type": "string",
            "description": "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API.",
            "example": "mistralai/Mistral-7B-Instruct-v0.2"
          },
          "n": {
            "type": "integer",
            "format": "int32",
            "description": "UNUSED\nHow many chat completion choices to generate for each input message. Note that you will be charged based on the\nnumber of generated tokens across all of the choices. Keep n as 1 to minimize costs.",
            "example": "2",
            "nullable": true,
            "minimum": 0
          },
          "presence_penalty": {
            "type": "number",
            "format": "float",
            "description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics",
            "example": 0.1,
            "nullable": true
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          },
          "stream": {
            "type": "boolean"
          },
          "temperature": {
            "type": "number",
            "format": "float",
            "description": "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both.",
            "example": 1.0,
            "nullable": true
          },
          "top_logprobs": {
            "type": "integer",
            "format": "int32",
            "description": "An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used.",
            "example": "5",
            "nullable": true,
            "minimum": 0
          },
          "top_p": {
            "type": "number",
            "format": "float",
            "description": "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.",
            "example": 0.95,
            "nullable": true
          }
        }
      },
      "CompatGenerateRequest": {
        "type": "object",
        "required": [
          "inputs"
        ],
        "properties": {
          "inputs": {
            "type": "string",
            "example": "My name is Olivier and I"
          },
          "parameters": {
            "$ref": "#/components/schemas/GenerateParameters"
          },
          "stream": {
            "type": "boolean",
            "default": "false"
          }
        }
      },
      "Details": {
        "type": "object",
        "required": [
          "finish_reason",
          "generated_tokens",
          "prefill",
          "tokens"
        ],
        "properties": {
          "best_of_sequences": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/BestOfSequence"
            },
            "nullable": true
          },
          "finish_reason": {
            "$ref": "#/components/schemas/FinishReason"
          },
          "generated_tokens": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "prefill": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/PrefillToken"
            }
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          },
          "tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "type": "array",
              "items": {
                "$ref": "#/components/schemas/Token"
              }
            }
          }
        }
      },
      "ErrorResponse": {
        "type": "object",
        "required": [
          "error",
          "error_type"
        ],
        "properties": {
          "error": {
            "type": "string"
          },
          "error_type": {
            "type": "string"
          }
        }
      },
      "FinishReason": {
        "type": "string",
        "enum": [
          "length",
          "eos_token",
          "stop_sequence"
        ],
        "example": "Length"
      },
      "GenerateParameters": {
        "type": "object",
        "properties": {
          "best_of": {
            "type": "integer",
            "default": "null",
            "example": 1,
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "decoder_input_details": {
            "type": "boolean",
            "default": "true"
          },
          "details": {
            "type": "boolean",
            "default": "true"
          },
          "do_sample": {
            "type": "boolean",
            "default": "false",
            "example": true
          },
          "frequency_penalty": {
            "type": "number",
            "format": "float",
            "default": "null",
            "example": 0.1,
            "nullable": true,
            "exclusiveMinimum": -2
          },
          "grammar": {
            "allOf": [
              {
                "$ref": "#/components/schemas/GrammarType"
              }
            ],
            "nullable": true
          },
          "max_new_tokens": {
            "type": "integer",
            "format": "int32",
            "default": "100",
            "example": "20",
            "nullable": true,
            "minimum": 0
          },
          "repetition_penalty": {
            "type": "number",
            "format": "float",
            "default": "null",
            "example": 1.03,
            "nullable": true,
            "exclusiveMinimum": 0
          },
          "return_full_text": {
            "type": "boolean",
            "default": "null",
            "example": false,
            "nullable": true
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "default": "null",
            "example": "null",
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "stop": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "example": [
              "photographer"
            ],
            "maxItems": 4
          },
          "temperature": {
            "type": "number",
            "format": "float",
            "default": "null",
            "example": 0.5,
            "nullable": true,
            "exclusiveMinimum": 0
          },
          "top_k": {
            "type": "integer",
            "format": "int32",
            "default": "null",
            "example": 10,
            "nullable": true,
            "exclusiveMinimum": 0
          },
          "top_n_tokens": {
            "type": "integer",
            "format": "int32",
            "default": "null",
            "example": 5,
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "top_p": {
            "type": "number",
            "format": "float",
            "default": "null",
            "example": 0.95,
            "nullable": true,
            "maximum": 1,
            "exclusiveMinimum": 0
          },
          "truncate": {
            "type": "integer",
            "default": "null",
            "example": "null",
            "nullable": true,
            "minimum": 0
          },
          "typical_p": {
            "type": "number",
            "format": "float",
            "default": "null",
            "example": 0.95,
            "nullable": true,
            "maximum": 1,
            "exclusiveMinimum": 0
          },
          "watermark": {
            "type": "boolean",
            "default": "false",
            "example": true
          }
        }
      },
      "GenerateRequest": {
        "type": "object",
        "required": [
          "inputs"
        ],
        "properties": {
          "inputs": {
            "type": "string",
            "example": "My name is Olivier and I"
          },
          "parameters": {
            "$ref": "#/components/schemas/GenerateParameters"
          }
        }
      },
      "GenerateResponse": {
        "type": "object",
        "required": [
          "generated_text"
        ],
        "properties": {
          "details": {
            "allOf": [
              {
                "$ref": "#/components/schemas/Details"
              }
            ],
            "nullable": true
          },
          "generated_text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "GrammarType": {
        "oneOf": [
          {
            "type": "object",
            "required": [
              "type",
              "value"
            ],
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "json"
                ]
              },
              "value": {
                "description": "A string that represents a [JSON Schema](https://json-schema.org/).\n\nJSON Schema is a declarative language that allows to annotate JSON documents\nwith types and descriptions."
              }
            }
          },
          {
            "type": "object",
            "required": [
              "type",
              "value"
            ],
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "regex"
                ]
              },
              "value": {
                "type": "string"
              }
            }
          }
        ],
        "discriminator": {
          "propertyName": "type"
        }
      },
      "Info": {
        "type": "object",
        "required": [
          "model_id",
          "model_dtype",
          "model_device_type",
          "max_concurrent_requests",
          "max_best_of",
          "max_stop_sequences",
          "max_input_length",
          "max_total_tokens",
          "waiting_served_ratio",
          "max_batch_total_tokens",
          "max_waiting_tokens",
          "validation_workers",
          "version"
        ],
        "properties": {
          "docker_label": {
            "type": "string",
            "example": "null",
            "nullable": true
          },
          "max_batch_size": {
            "type": "integer",
            "example": "null",
            "nullable": true,
            "minimum": 0
          },
          "max_batch_total_tokens": {
            "type": "integer",
            "format": "int32",
            "example": "32000",
            "minimum": 0
          },
          "max_best_of": {
            "type": "integer",
            "example": "2",
            "minimum": 0
          },
          "max_concurrent_requests": {
            "type": "integer",
            "description": "Router Parameters",
            "example": "128",
            "minimum": 0
          },
          "max_input_length": {
            "type": "integer",
            "example": "1024",
            "minimum": 0
          },
          "max_stop_sequences": {
            "type": "integer",
            "example": "4",
            "minimum": 0
          },
          "max_total_tokens": {
            "type": "integer",
            "example": "2048",
            "minimum": 0
          },
          "max_waiting_tokens": {
            "type": "integer",
            "example": "20",
            "minimum": 0
          },
          "model_device_type": {
            "type": "string",
            "example": "cuda"
          },
          "model_dtype": {
            "type": "string",
            "example": "torch.float16"
          },
          "model_id": {
            "type": "string",
            "description": "Model info",
            "example": "bigscience/blomm-560m"
          },
          "model_pipeline_tag": {
            "type": "string",
            "example": "text-generation",
            "nullable": true
          },
          "model_sha": {
            "type": "string",
            "example": "e985a63cdc139290c5f700ff1929f0b5942cced2",
            "nullable": true
          },
          "sha": {
            "type": "string",
            "example": "null",
            "nullable": true
          },
          "validation_workers": {
            "type": "integer",
            "example": "2",
            "minimum": 0
          },
          "version": {
            "type": "string",
            "description": "Router Info",
            "example": "0.5.0"
          },
          "waiting_served_ratio": {
            "type": "number",
            "format": "float",
            "example": "1.2"
          }
        }
      },
      "Message": {
        "type": "object",
        "required": [
          "role",
          "content"
        ],
        "properties": {
          "content": {
            "type": "string",
            "example": "My name is David and I"
          },
          "name": {
            "type": "string",
            "example": "\"David\"",
            "nullable": true
          },
          "role": {
            "type": "string",
            "example": "user"
          }
        }
      },
      "PrefillToken": {
        "type": "object",
        "required": [
          "id",
          "text",
          "logprob"
        ],
        "properties": {
          "id": {
            "type": "integer",
            "format": "int32",
            "example": 0,
            "minimum": 0
          },
          "logprob": {
            "type": "number",
            "format": "float",
            "example": -0.34,
            "nullable": true
          },
          "text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "SimpleToken": {
        "type": "object",
        "required": [
          "id",
          "text",
          "start",
          "stop"
        ],
        "properties": {
          "id": {
            "type": "integer",
            "format": "int32",
            "example": 0,
            "minimum": 0
          },
          "start": {
            "type": "integer",
            "example": 0,
            "minimum": 0
          },
          "stop": {
            "type": "integer",
            "example": 2,
            "minimum": 0
          },
          "text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "StreamDetails": {
        "type": "object",
        "required": [
          "finish_reason",
          "generated_tokens"
        ],
        "properties": {
          "finish_reason": {
            "$ref": "#/components/schemas/FinishReason"
          },
          "generated_tokens": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          }
        }
      },
      "StreamResponse": {
        "type": "object",
        "required": [
          "index",
          "token"
        ],
        "properties": {
          "details": {
            "allOf": [
              {
                "$ref": "#/components/schemas/StreamDetails"
              }
            ],
            "default": "null",
            "nullable": true
          },
          "generated_text": {
            "type": "string",
            "default": "null",
            "example": "test",
            "nullable": true
          },
          "index": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "token": {
            "$ref": "#/components/schemas/Token"
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          }
        }
      },
      "Token": {
        "type": "object",
        "required": [
          "id",
          "text",
          "logprob",
          "special"
        ],
        "properties": {
          "id": {
            "type": "integer",
            "format": "int32",
            "example": 0,
            "minimum": 0
          },
          "logprob": {
            "type": "number",
            "format": "float",
            "example": -0.34,
            "nullable": true
          },
          "special": {
            "type": "boolean",
            "example": "false"
          },
          "text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "TokenizeResponse": {
        "type": "array",
        "items": {
          "$ref": "#/components/schemas/SimpleToken"
        }
      },
      "Usage": {
        "type": "object",
        "required": [
          "prompt_tokens",
          "completion_tokens",
          "total_tokens"
        ],
        "properties": {
          "completion_tokens": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "prompt_tokens": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          },
          "total_tokens": {
            "type": "integer",
            "format": "int32",
            "minimum": 0
          }
        }
      }
    }
  },
  "tags": [
    {
      "name": "Text Generation Inference",
      "description": "Hugging Face Text Generation Inference API"
    }
  ]
}