2024-01-26 11:04:57 -07:00
{
"openapi" : "3.0.3" ,
"info" : {
"title" : "Text Generation Inference" ,
"description" : "Text Generation Webserver" ,
"contact" : {
"name" : "Olivier Dehaene"
} ,
"license" : {
"name" : "Apache 2.0" ,
"url" : "https://www.apache.org/licenses/LICENSE-2.0"
} ,
2024-07-05 02:29:56 -06:00
"version" : "2.1.2-dev0"
2024-01-26 11:04:57 -07:00
} ,
"paths" : {
"/" : {
"post" : {
"tags" : [
"Text Generation Inference"
] ,
"summary" : "Generate tokens if `stream == false` or a stream of token if `stream == true`" ,
"operationId" : "compat_generate" ,
"requestBody" : {
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/CompatGenerateRequest"
}
}
} ,
"required" : true
} ,
"responses" : {
"200" : {
"description" : "Generated Text" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/GenerateResponse"
}
} ,
"text/event-stream" : {
"schema" : {
"$ref" : "#/components/schemas/StreamResponse"
}
}
}
} ,
"422" : {
"description" : "Input validation error" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Input validation error"
}
}
}
} ,
"424" : {
"description" : "Generation Error" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Request failed during generation"
}
}
}
} ,
"429" : {
"description" : "Model is overloaded" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Model is overloaded"
}
}
}
} ,
"500" : {
"description" : "Incomplete generation" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Incomplete generation"
}
}
}
}
}
}
} ,
"/generate" : {
"post" : {
"tags" : [
"Text Generation Inference"
] ,
"summary" : "Generate tokens" ,
"operationId" : "generate" ,
"requestBody" : {
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/GenerateRequest"
}
}
} ,
"required" : true
} ,
"responses" : {
"200" : {
"description" : "Generated Text" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/GenerateResponse"
}
}
}
} ,
"422" : {
"description" : "Input validation error" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Input validation error"
}
}
}
} ,
"424" : {
"description" : "Generation Error" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Request failed during generation"
}
}
}
} ,
"429" : {
"description" : "Model is overloaded" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Model is overloaded"
}
}
}
} ,
"500" : {
"description" : "Incomplete generation" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Incomplete generation"
}
}
}
}
}
}
} ,
"/generate_stream" : {
"post" : {
"tags" : [
"Text Generation Inference"
] ,
"summary" : "Generate a stream of token using Server-Sent Events" ,
"operationId" : "generate_stream" ,
"requestBody" : {
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/GenerateRequest"
}
}
} ,
"required" : true
} ,
"responses" : {
"200" : {
"description" : "Generated Text" ,
"content" : {
"text/event-stream" : {
"schema" : {
"$ref" : "#/components/schemas/StreamResponse"
}
}
}
} ,
"422" : {
"description" : "Input validation error" ,
"content" : {
"text/event-stream" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Input validation error"
}
}
}
} ,
"424" : {
"description" : "Generation Error" ,
"content" : {
"text/event-stream" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Request failed during generation"
}
}
}
} ,
"429" : {
"description" : "Model is overloaded" ,
"content" : {
"text/event-stream" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Model is overloaded"
}
}
}
} ,
"500" : {
"description" : "Incomplete generation" ,
"content" : {
"text/event-stream" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Incomplete generation"
}
}
}
}
}
}
} ,
"/health" : {
"get" : {
"tags" : [
"Text Generation Inference"
] ,
"summary" : "Health check method" ,
"operationId" : "health" ,
"responses" : {
"200" : {
"description" : "Everything is working fine"
} ,
"503" : {
"description" : "Text generation inference is down" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "unhealthy" ,
"error_type" : "healthcheck"
}
}
}
}
}
}
} ,
"/info" : {
"get" : {
"tags" : [
"Text Generation Inference"
] ,
"summary" : "Text Generation Inference endpoint info" ,
"operationId" : "get_model_info" ,
"responses" : {
"200" : {
"description" : "Served model info" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/Info"
}
}
}
}
}
}
} ,
"/metrics" : {
"get" : {
"tags" : [
"Text Generation Inference"
] ,
"summary" : "Prometheus metrics scrape endpoint" ,
"operationId" : "metrics" ,
"responses" : {
"200" : {
"description" : "Prometheus Metrics" ,
"content" : {
"text/plain" : {
"schema" : {
"type" : "string"
}
}
}
}
}
}
} ,
"/tokenize" : {
"post" : {
"tags" : [
"Text Generation Inference"
] ,
"summary" : "Tokenize inputs" ,
"operationId" : "tokenize" ,
"requestBody" : {
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/GenerateRequest"
}
}
} ,
"required" : true
} ,
"responses" : {
"200" : {
"description" : "Tokenized ids" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/TokenizeResponse"
}
}
}
} ,
"404" : {
"description" : "No tokenizer found" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "No fast tokenizer available"
}
}
}
}
}
}
} ,
"/v1/chat/completions" : {
"post" : {
"tags" : [
"Text Generation Inference"
] ,
"summary" : "Generate tokens" ,
"operationId" : "chat_completions" ,
"requestBody" : {
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ChatRequest"
}
}
} ,
"required" : true
} ,
"responses" : {
"200" : {
2024-04-18 09:17:40 -06:00
"description" : "Generated Chat Completion" ,
2024-01-26 11:04:57 -07:00
"content" : {
"application/json" : {
2024-04-18 09:17:40 -06:00
"schema" : {
"$ref" : "#/components/schemas/ChatCompletion"
}
} ,
"text/event-stream" : {
2024-01-26 11:04:57 -07:00
"schema" : {
"$ref" : "#/components/schemas/ChatCompletionChunk"
}
}
}
} ,
"422" : {
"description" : "Input validation error" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Input validation error"
}
}
}
} ,
"424" : {
"description" : "Generation Error" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Request failed during generation"
}
}
}
} ,
"429" : {
"description" : "Model is overloaded" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Model is overloaded"
}
}
}
} ,
"500" : {
"description" : "Incomplete generation" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Incomplete generation"
}
}
}
}
}
}
2024-03-29 12:17:24 -06:00
} ,
"/v1/completions" : {
"post" : {
"tags" : [
"Text Generation Inference"
] ,
"summary" : "Generate tokens" ,
"operationId" : "completions" ,
"requestBody" : {
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/CompletionRequest"
}
}
} ,
"required" : true
} ,
"responses" : {
"200" : {
2024-04-18 09:17:40 -06:00
"description" : "Generated Chat Completion" ,
2024-03-29 12:17:24 -06:00
"content" : {
"application/json" : {
"schema" : {
2024-04-18 09:17:40 -06:00
"$ref" : "#/components/schemas/Completion"
}
} ,
"text/event-stream" : {
"schema" : {
"$ref" : "#/components/schemas/CompletionCompleteChunk"
2024-03-29 12:17:24 -06:00
}
}
}
} ,
"422" : {
"description" : "Input validation error" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Input validation error"
}
}
}
} ,
"424" : {
"description" : "Generation Error" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Request failed during generation"
}
}
}
} ,
"429" : {
"description" : "Model is overloaded" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Model is overloaded"
}
}
}
} ,
"500" : {
"description" : "Incomplete generation" ,
"content" : {
"application/json" : {
"schema" : {
"$ref" : "#/components/schemas/ErrorResponse"
} ,
"example" : {
"error" : "Incomplete generation"
}
}
}
}
}
}
2024-01-26 11:04:57 -07:00
}
} ,
"components" : {
"schemas" : {
"BestOfSequence" : {
"type" : "object" ,
"required" : [
"generated_text" ,
"finish_reason" ,
"generated_tokens" ,
"prefill" ,
"tokens"
] ,
"properties" : {
"finish_reason" : {
"$ref" : "#/components/schemas/FinishReason"
} ,
"generated_text" : {
"type" : "string" ,
"example" : "test"
} ,
"generated_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
"example" : 1 ,
"minimum" : 0
} ,
"prefill" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/PrefillToken"
}
} ,
"seed" : {
"type" : "integer" ,
"format" : "int64" ,
"example" : 42 ,
"nullable" : true ,
"minimum" : 0
} ,
"tokens" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/Token"
}
} ,
"top_tokens" : {
"type" : "array" ,
"items" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/Token"
}
}
}
}
} ,
"ChatCompletion" : {
"type" : "object" ,
"required" : [
"id" ,
"created" ,
"model" ,
"system_fingerprint" ,
"choices" ,
"usage"
] ,
"properties" : {
"choices" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/ChatCompletionComplete"
}
} ,
"created" : {
"type" : "integer" ,
"format" : "int64" ,
"example" : "1706270835" ,
"minimum" : 0
} ,
"id" : {
"type" : "string"
} ,
"model" : {
"type" : "string" ,
"example" : "mistralai/Mistral-7B-Instruct-v0.2"
} ,
"system_fingerprint" : {
"type" : "string"
} ,
"usage" : {
"$ref" : "#/components/schemas/Usage"
}
}
} ,
"ChatCompletionChoice" : {
"type" : "object" ,
"required" : [
"index" ,
"delta"
] ,
"properties" : {
"delta" : {
"$ref" : "#/components/schemas/ChatCompletionDelta"
} ,
"finish_reason" : {
"type" : "string" ,
"nullable" : true
} ,
"index" : {
"type" : "integer" ,
"format" : "int32" ,
"minimum" : 0
} ,
"logprobs" : {
2024-02-16 09:50:57 -07:00
"allOf" : [
{
"$ref" : "#/components/schemas/ChatCompletionLogprobs"
}
] ,
2024-01-26 11:04:57 -07:00
"nullable" : true
}
}
} ,
"ChatCompletionChunk" : {
"type" : "object" ,
"required" : [
"id" ,
"created" ,
"model" ,
"system_fingerprint" ,
"choices"
] ,
"properties" : {
"choices" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/ChatCompletionChoice"
}
} ,
"created" : {
"type" : "integer" ,
"format" : "int64" ,
"example" : "1706270978" ,
"minimum" : 0
} ,
"id" : {
"type" : "string"
} ,
"model" : {
"type" : "string" ,
"example" : "mistralai/Mistral-7B-Instruct-v0.2"
} ,
"system_fingerprint" : {
"type" : "string"
}
}
} ,
2024-02-21 07:30:45 -07:00
"ChatCompletionComplete" : {
"type" : "object" ,
"required" : [
"index" ,
"message" ,
"finish_reason"
] ,
"properties" : {
"finish_reason" : {
"type" : "string"
} ,
"index" : {
"type" : "integer" ,
"format" : "int32" ,
"minimum" : 0
} ,
"logprobs" : {
"allOf" : [
{
"$ref" : "#/components/schemas/ChatCompletionLogprobs"
}
] ,
"nullable" : true
} ,
"message" : {
2024-07-03 01:53:35 -06:00
"$ref" : "#/components/schemas/OutputMessage"
2024-02-21 07:30:45 -07:00
}
}
} ,
2024-01-26 11:04:57 -07:00
"ChatCompletionDelta" : {
2024-07-03 01:53:35 -06:00
"oneOf" : [
{
"$ref" : "#/components/schemas/TextMessage"
2024-03-29 12:17:24 -06:00
} ,
2024-07-03 01:53:35 -06:00
{
"$ref" : "#/components/schemas/ToolCallDelta"
2024-01-26 11:04:57 -07:00
}
2024-07-03 01:53:35 -06:00
]
2024-01-26 11:04:57 -07:00
} ,
2024-02-21 07:30:45 -07:00
"ChatCompletionLogprob" : {
"type" : "object" ,
"required" : [
"token" ,
"logprob" ,
"top_logprobs"
] ,
"properties" : {
"logprob" : {
"type" : "number" ,
"format" : "float"
} ,
"token" : {
"type" : "string"
} ,
"top_logprobs" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/ChatCompletionTopLogprob"
}
}
}
} ,
"ChatCompletionLogprobs" : {
"type" : "object" ,
"required" : [
"content"
] ,
"properties" : {
"content" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/ChatCompletionLogprob"
}
}
}
} ,
"ChatCompletionTopLogprob" : {
"type" : "object" ,
"required" : [
"token" ,
"logprob"
] ,
"properties" : {
"logprob" : {
"type" : "number" ,
"format" : "float"
} ,
"token" : {
"type" : "string"
}
}
} ,
2024-01-26 11:04:57 -07:00
"ChatRequest" : {
"type" : "object" ,
"required" : [
2024-03-29 12:17:24 -06:00
"model" ,
"messages"
2024-01-26 11:04:57 -07:00
] ,
"properties" : {
"frequency_penalty" : {
"type" : "number" ,
"format" : "float" ,
"description" : "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim." ,
"example" : "1.0" ,
"nullable" : true
} ,
"logit_bias" : {
"type" : "array" ,
"items" : {
"type" : "number" ,
"format" : "float"
} ,
"description" : "UNUSED\nModify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens\n(specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically,\nthe bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model,\nbut values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should\nresult in a ban or exclusive selection of the relevant token." ,
"nullable" : true
} ,
"logprobs" : {
"type" : "boolean" ,
"description" : "Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each\noutput token returned in the content of message." ,
"example" : "false" ,
"nullable" : true
} ,
"max_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
"description" : "The maximum number of tokens that can be generated in the chat completion." ,
"example" : "32" ,
"nullable" : true ,
"minimum" : 0
} ,
"messages" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/Message"
} ,
2024-03-29 12:17:24 -06:00
"description" : "A list of messages comprising the conversation so far." ,
"example" : "[{\"role\": \"user\", \"content\": \"What is Deep Learning?\"}]"
2024-01-26 11:04:57 -07:00
} ,
"model" : {
"type" : "string" ,
2024-03-29 12:17:24 -06:00
"description" : "[UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API." ,
2024-01-26 11:04:57 -07:00
"example" : "mistralai/Mistral-7B-Instruct-v0.2"
} ,
"n" : {
"type" : "integer" ,
"format" : "int32" ,
"description" : "UNUSED\nHow many chat completion choices to generate for each input message. Note that you will be charged based on the\nnumber of generated tokens across all of the choices. Keep n as 1 to minimize costs." ,
"example" : "2" ,
"nullable" : true ,
"minimum" : 0
} ,
"presence_penalty" : {
"type" : "number" ,
"format" : "float" ,
2024-02-16 09:50:57 -07:00
"description" : "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far,\nincreasing the model's likelihood to talk about new topics" ,
2024-01-26 11:04:57 -07:00
"example" : 0.1 ,
"nullable" : true
} ,
2024-07-03 01:53:35 -06:00
"response_format" : {
"allOf" : [
{
"$ref" : "#/components/schemas/GrammarType"
}
] ,
"default" : "null" ,
"nullable" : true
} ,
2024-01-26 11:04:57 -07:00
"seed" : {
"type" : "integer" ,
"format" : "int64" ,
"example" : 42 ,
"nullable" : true ,
"minimum" : 0
} ,
2024-03-29 12:17:24 -06:00
"stop" : {
"type" : "array" ,
"items" : {
"type" : "string"
} ,
"description" : "Up to 4 sequences where the API will stop generating further tokens." ,
"example" : "null" ,
"nullable" : true
} ,
2024-01-26 11:04:57 -07:00
"stream" : {
"type" : "boolean"
} ,
"temperature" : {
"type" : "number" ,
"format" : "float" ,
"description" : "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic.\n\nWe generally recommend altering this or `top_p` but not both." ,
"example" : 1.0 ,
"nullable" : true
} ,
2024-03-29 12:17:24 -06:00
"tool_choice" : {
"allOf" : [
{
"$ref" : "#/components/schemas/ToolType"
}
] ,
"nullable" : true
} ,
"tool_prompt" : {
"type" : "string" ,
"description" : "A prompt to be appended before the tools" ,
2024-04-18 09:17:40 -06:00
"example" : "\"You will be presented with a JSON schema representing a set of tools.\nIf the user request lacks of sufficient information to make a precise tool selection: Do not invent any tool's properties, instead notify with an error message.\n\nJSON Schema:\n\"" ,
2024-03-29 12:17:24 -06:00
"nullable" : true
} ,
"tools" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/Tool"
} ,
"description" : "A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of\nfunctions the model may generate JSON inputs for." ,
"example" : "null" ,
"nullable" : true
} ,
2024-01-26 11:04:57 -07:00
"top_logprobs" : {
"type" : "integer" ,
"format" : "int32" ,
2024-02-16 09:50:57 -07:00
"description" : "An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with\nan associated log probability. logprobs must be set to true if this parameter is used." ,
2024-01-26 11:04:57 -07:00
"example" : "5" ,
"nullable" : true ,
"minimum" : 0
} ,
"top_p" : {
"type" : "number" ,
"format" : "float" ,
"description" : "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered." ,
"example" : 0.95 ,
"nullable" : true
}
}
} ,
2024-07-03 04:56:27 -06:00
"Chunk" : {
"type" : "object" ,
"required" : [
"id" ,
"created" ,
"choices" ,
"model" ,
"system_fingerprint"
] ,
"properties" : {
"choices" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/CompletionComplete"
}
} ,
"created" : {
"type" : "integer" ,
"format" : "int64" ,
"minimum" : 0
} ,
"id" : {
"type" : "string"
} ,
"model" : {
"type" : "string"
} ,
"system_fingerprint" : {
"type" : "string"
}
}
} ,
2024-01-26 11:04:57 -07:00
"CompatGenerateRequest" : {
"type" : "object" ,
"required" : [
"inputs"
] ,
"properties" : {
"inputs" : {
"type" : "string" ,
"example" : "My name is Olivier and I"
} ,
"parameters" : {
"$ref" : "#/components/schemas/GenerateParameters"
} ,
"stream" : {
"type" : "boolean" ,
"default" : "false"
}
}
} ,
2024-07-03 04:56:27 -06:00
"Completion" : {
"oneOf" : [
{
"allOf" : [
{
"$ref" : "#/components/schemas/Chunk"
} ,
{
"type" : "object" ,
"required" : [
"object"
] ,
"properties" : {
"object" : {
"type" : "string" ,
"enum" : [
"text_completion"
]
}
}
}
]
} ,
{
"allOf" : [
{
"$ref" : "#/components/schemas/CompletionFinal"
} ,
{
"type" : "object" ,
"required" : [
"object"
] ,
"properties" : {
"object" : {
"type" : "string" ,
"enum" : [
"text_completion"
]
}
}
}
]
}
] ,
"discriminator" : {
"propertyName" : "object"
}
} ,
2024-03-29 12:17:24 -06:00
"CompletionComplete" : {
"type" : "object" ,
"required" : [
"index" ,
"text" ,
"finish_reason"
] ,
"properties" : {
"finish_reason" : {
"type" : "string"
} ,
"index" : {
"type" : "integer" ,
"format" : "int32" ,
"minimum" : 0
} ,
"logprobs" : {
"type" : "array" ,
"items" : {
"type" : "number" ,
"format" : "float"
} ,
"nullable" : true
} ,
"text" : {
"type" : "string"
}
}
} ,
2024-07-03 04:56:27 -06:00
"CompletionFinal" : {
2024-03-29 12:17:24 -06:00
"type" : "object" ,
"required" : [
"id" ,
"created" ,
2024-07-03 04:41:39 -06:00
"model" ,
2024-07-03 04:56:27 -06:00
"system_fingerprint" ,
"choices" ,
"usage"
2024-03-29 12:17:24 -06:00
] ,
"properties" : {
"choices" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/CompletionComplete"
}
} ,
"created" : {
"type" : "integer" ,
"format" : "int64" ,
2024-07-03 04:56:27 -06:00
"example" : "1706270835" ,
2024-03-29 12:17:24 -06:00
"minimum" : 0
} ,
"id" : {
"type" : "string"
} ,
"model" : {
2024-07-03 04:56:27 -06:00
"type" : "string" ,
"example" : "mistralai/Mistral-7B-Instruct-v0.2"
2024-03-29 12:17:24 -06:00
} ,
"system_fingerprint" : {
"type" : "string"
2024-07-03 04:56:27 -06:00
} ,
"usage" : {
"$ref" : "#/components/schemas/Usage"
2024-03-29 12:17:24 -06:00
}
}
} ,
"CompletionRequest" : {
"type" : "object" ,
"required" : [
"model" ,
"prompt"
] ,
"properties" : {
"frequency_penalty" : {
"type" : "number" ,
"format" : "float" ,
"description" : "Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim." ,
"example" : "1.0" ,
"nullable" : true
} ,
"max_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
"description" : "The maximum number of tokens that can be generated in the chat completion." ,
"default" : "32" ,
"nullable" : true ,
"minimum" : 0
} ,
"model" : {
"type" : "string" ,
"description" : "UNUSED\nID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API." ,
"example" : "mistralai/Mistral-7B-Instruct-v0.2"
} ,
"prompt" : {
2024-07-03 01:53:35 -06:00
"$ref" : "#/components/schemas/Prompt"
2024-03-29 12:17:24 -06:00
} ,
"repetition_penalty" : {
"type" : "number" ,
"format" : "float" ,
"nullable" : true
} ,
"seed" : {
"type" : "integer" ,
"format" : "int64" ,
"example" : 42 ,
"nullable" : true ,
"minimum" : 0
} ,
2024-07-03 01:53:35 -06:00
"stop" : {
"type" : "array" ,
"items" : {
"type" : "string"
} ,
"description" : "Up to 4 sequences where the API will stop generating further tokens." ,
"example" : "null" ,
"nullable" : true
} ,
2024-03-29 12:17:24 -06:00
"stream" : {
"type" : "boolean"
} ,
"suffix" : {
"type" : "string" ,
"description" : "The text to append to the prompt. This is useful for completing sentences or generating a paragraph of text.\nplease see the completion_template field in the model's tokenizer_config.json file for completion template." ,
"nullable" : true
} ,
"temperature" : {
"type" : "number" ,
"format" : "float" ,
"description" : "What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while\nlower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or `top_p` but not both." ,
"example" : 1.0 ,
"nullable" : true
} ,
"top_p" : {
"type" : "number" ,
"format" : "float" ,
"description" : "An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the\ntokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered." ,
"example" : 0.95 ,
"nullable" : true
}
}
} ,
"DeltaToolCall" : {
"type" : "object" ,
"required" : [
"index" ,
"id" ,
"type" ,
"function"
] ,
"properties" : {
"function" : {
"$ref" : "#/components/schemas/Function"
} ,
"id" : {
"type" : "string"
} ,
"index" : {
"type" : "integer" ,
"format" : "int32" ,
"minimum" : 0
} ,
"type" : {
"type" : "string"
}
}
} ,
2024-01-26 11:04:57 -07:00
"Details" : {
"type" : "object" ,
"required" : [
"finish_reason" ,
"generated_tokens" ,
"prefill" ,
"tokens"
] ,
"properties" : {
"best_of_sequences" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/BestOfSequence"
} ,
"nullable" : true
} ,
"finish_reason" : {
"$ref" : "#/components/schemas/FinishReason"
} ,
"generated_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
"example" : 1 ,
"minimum" : 0
} ,
"prefill" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/PrefillToken"
}
} ,
"seed" : {
"type" : "integer" ,
"format" : "int64" ,
"example" : 42 ,
"nullable" : true ,
"minimum" : 0
} ,
"tokens" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/Token"
}
} ,
"top_tokens" : {
"type" : "array" ,
"items" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/Token"
}
}
}
}
} ,
"ErrorResponse" : {
"type" : "object" ,
"required" : [
"error" ,
"error_type"
] ,
"properties" : {
"error" : {
"type" : "string"
} ,
"error_type" : {
"type" : "string"
}
}
} ,
"FinishReason" : {
"type" : "string" ,
"enum" : [
"length" ,
"eos_token" ,
"stop_sequence"
] ,
"example" : "Length"
} ,
2024-03-29 12:17:24 -06:00
"Function" : {
"type" : "object" ,
"required" : [
"arguments"
] ,
"properties" : {
"arguments" : {
"type" : "string"
} ,
"name" : {
"type" : "string" ,
"nullable" : true
}
}
} ,
"FunctionDefinition" : {
"type" : "object" ,
"required" : [
"name" ,
2024-04-18 09:17:40 -06:00
"arguments"
2024-03-29 12:17:24 -06:00
] ,
"properties" : {
2024-04-18 09:17:40 -06:00
"arguments" : { } ,
2024-03-29 12:17:24 -06:00
"description" : {
"type" : "string" ,
"nullable" : true
} ,
"name" : {
"type" : "string"
2024-04-18 09:17:40 -06:00
}
2024-03-29 12:17:24 -06:00
}
} ,
2024-01-26 11:04:57 -07:00
"GenerateParameters" : {
"type" : "object" ,
"properties" : {
2024-07-03 01:53:35 -06:00
"adapter_id" : {
"type" : "string" ,
"description" : "Lora adapter id" ,
"default" : "null" ,
"example" : "null" ,
"nullable" : true
} ,
2024-01-26 11:04:57 -07:00
"best_of" : {
"type" : "integer" ,
2024-07-03 01:53:35 -06:00
"description" : "Generate best_of sequences and return the one if the highest token logprobs." ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : 1 ,
"nullable" : true ,
"minimum" : 0 ,
"exclusiveMinimum" : 0
} ,
"decoder_input_details" : {
"type" : "boolean" ,
2024-07-03 01:53:35 -06:00
"description" : "Whether to return decoder input token logprobs and ids." ,
2024-04-18 09:17:40 -06:00
"default" : "false"
2024-01-26 11:04:57 -07:00
} ,
"details" : {
"type" : "boolean" ,
2024-07-03 01:53:35 -06:00
"description" : "Whether to return generation details." ,
2024-01-26 11:04:57 -07:00
"default" : "true"
} ,
"do_sample" : {
"type" : "boolean" ,
2024-07-03 01:53:35 -06:00
"description" : "Activate logits sampling." ,
2024-01-26 11:04:57 -07:00
"default" : "false" ,
"example" : true
} ,
2024-02-16 09:50:57 -07:00
"frequency_penalty" : {
"type" : "number" ,
"format" : "float" ,
2024-07-03 01:53:35 -06:00
"description" : "The parameter for frequency penalty. 1.0 means no penalty\nPenalize new tokens based on their existing frequency in the text so far,\ndecreasing the model's likelihood to repeat the same line verbatim." ,
2024-02-16 09:50:57 -07:00
"default" : "null" ,
"example" : 0.1 ,
"nullable" : true ,
"exclusiveMinimum" : -2
} ,
"grammar" : {
"allOf" : [
{
"$ref" : "#/components/schemas/GrammarType"
}
] ,
2024-04-18 09:17:40 -06:00
"default" : "null" ,
2024-02-16 09:50:57 -07:00
"nullable" : true
} ,
2024-01-26 11:04:57 -07:00
"max_new_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
2024-07-03 01:53:35 -06:00
"description" : "Maximum number of tokens to generate." ,
2024-01-26 11:04:57 -07:00
"default" : "100" ,
"example" : "20" ,
"nullable" : true ,
"minimum" : 0
} ,
"repetition_penalty" : {
"type" : "number" ,
"format" : "float" ,
2024-07-03 01:53:35 -06:00
"description" : "The parameter for repetition penalty. 1.0 means no penalty.\nSee [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details." ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : 1.03 ,
"nullable" : true ,
"exclusiveMinimum" : 0
} ,
"return_full_text" : {
"type" : "boolean" ,
2024-07-03 01:53:35 -06:00
"description" : "Whether to prepend the prompt to the generated text" ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : false ,
"nullable" : true
} ,
"seed" : {
"type" : "integer" ,
"format" : "int64" ,
2024-07-03 01:53:35 -06:00
"description" : "Random sampling seed." ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : "null" ,
"nullable" : true ,
"minimum" : 0 ,
"exclusiveMinimum" : 0
} ,
"stop" : {
"type" : "array" ,
"items" : {
"type" : "string"
} ,
2024-07-03 01:53:35 -06:00
"description" : "Stop generating tokens if a member of `stop` is generated." ,
2024-01-26 11:04:57 -07:00
"example" : [
"photographer"
] ,
"maxItems" : 4
} ,
"temperature" : {
"type" : "number" ,
"format" : "float" ,
2024-07-03 01:53:35 -06:00
"description" : "The value used to module the logits distribution." ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : 0.5 ,
"nullable" : true ,
"exclusiveMinimum" : 0
} ,
"top_k" : {
"type" : "integer" ,
"format" : "int32" ,
2024-07-03 01:53:35 -06:00
"description" : "The number of highest probability vocabulary tokens to keep for top-k-filtering." ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : 10 ,
"nullable" : true ,
"exclusiveMinimum" : 0
} ,
"top_n_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
2024-07-03 01:53:35 -06:00
"description" : "The number of highest probability vocabulary tokens to keep for top-n-filtering." ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : 5 ,
"nullable" : true ,
"minimum" : 0 ,
"exclusiveMinimum" : 0
} ,
"top_p" : {
"type" : "number" ,
"format" : "float" ,
2024-07-03 01:53:35 -06:00
"description" : "Top-p value for nucleus sampling." ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : 0.95 ,
"nullable" : true ,
"maximum" : 1 ,
"exclusiveMinimum" : 0
} ,
"truncate" : {
"type" : "integer" ,
2024-07-03 01:53:35 -06:00
"description" : "Truncate inputs tokens to the given size." ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : "null" ,
"nullable" : true ,
"minimum" : 0
} ,
"typical_p" : {
"type" : "number" ,
"format" : "float" ,
2024-07-03 01:53:35 -06:00
"description" : "Typical Decoding mass\nSee [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information." ,
2024-01-26 11:04:57 -07:00
"default" : "null" ,
"example" : 0.95 ,
"nullable" : true ,
"maximum" : 1 ,
"exclusiveMinimum" : 0
} ,
"watermark" : {
"type" : "boolean" ,
2024-07-03 01:53:35 -06:00
"description" : "Watermarking with [A Watermark for Large Language Models](https://arxiv.org/abs/2301.10226)." ,
2024-01-26 11:04:57 -07:00
"default" : "false" ,
"example" : true
}
}
} ,
"GenerateRequest" : {
"type" : "object" ,
"required" : [
"inputs"
] ,
"properties" : {
"inputs" : {
"type" : "string" ,
"example" : "My name is Olivier and I"
} ,
"parameters" : {
"$ref" : "#/components/schemas/GenerateParameters"
}
}
} ,
"GenerateResponse" : {
"type" : "object" ,
"required" : [
"generated_text"
] ,
"properties" : {
"details" : {
"allOf" : [
{
"$ref" : "#/components/schemas/Details"
}
] ,
"nullable" : true
} ,
"generated_text" : {
"type" : "string" ,
"example" : "test"
}
}
} ,
2024-02-21 03:05:32 -07:00
"GrammarType" : {
"oneOf" : [
{
"type" : "object" ,
"required" : [
"type" ,
"value"
] ,
"properties" : {
"type" : {
"type" : "string" ,
"enum" : [
"json"
]
} ,
"value" : {
2024-02-21 07:30:45 -07:00
"description" : "A string that represents a [JSON Schema](https://json-schema.org/).\n\nJSON Schema is a declarative language that allows to annotate JSON documents\nwith types and descriptions."
2024-02-21 03:05:32 -07:00
}
}
} ,
{
"type" : "object" ,
"required" : [
"type" ,
"value"
] ,
"properties" : {
"type" : {
"type" : "string" ,
"enum" : [
"regex"
]
} ,
"value" : {
"type" : "string"
}
}
}
] ,
"discriminator" : {
"propertyName" : "type"
}
} ,
2024-01-26 11:04:57 -07:00
"Info" : {
"type" : "object" ,
"required" : [
"model_id" ,
"model_dtype" ,
"model_device_type" ,
"max_concurrent_requests" ,
"max_best_of" ,
"max_stop_sequences" ,
2024-07-03 01:53:35 -06:00
"max_input_tokens" ,
2024-01-26 11:04:57 -07:00
"max_total_tokens" ,
"waiting_served_ratio" ,
"max_batch_total_tokens" ,
"max_waiting_tokens" ,
"validation_workers" ,
2024-04-18 09:17:40 -06:00
"max_client_batch_size" ,
2024-07-03 01:53:35 -06:00
"router" ,
2024-01-26 11:04:57 -07:00
"version"
] ,
"properties" : {
"docker_label" : {
"type" : "string" ,
"example" : "null" ,
"nullable" : true
} ,
2024-02-16 09:50:57 -07:00
"max_batch_size" : {
"type" : "integer" ,
"example" : "null" ,
"nullable" : true ,
"minimum" : 0
} ,
2024-01-26 11:04:57 -07:00
"max_batch_total_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
"example" : "32000" ,
"minimum" : 0
} ,
"max_best_of" : {
"type" : "integer" ,
"example" : "2" ,
"minimum" : 0
} ,
2024-04-18 09:17:40 -06:00
"max_client_batch_size" : {
"type" : "integer" ,
"example" : "32" ,
"minimum" : 0
} ,
2024-01-26 11:04:57 -07:00
"max_concurrent_requests" : {
"type" : "integer" ,
"description" : "Router Parameters" ,
"example" : "128" ,
"minimum" : 0
} ,
2024-07-03 01:53:35 -06:00
"max_input_tokens" : {
2024-01-26 11:04:57 -07:00
"type" : "integer" ,
"example" : "1024" ,
"minimum" : 0
} ,
"max_stop_sequences" : {
"type" : "integer" ,
"example" : "4" ,
"minimum" : 0
} ,
"max_total_tokens" : {
"type" : "integer" ,
"example" : "2048" ,
"minimum" : 0
} ,
"max_waiting_tokens" : {
"type" : "integer" ,
"example" : "20" ,
"minimum" : 0
} ,
"model_device_type" : {
"type" : "string" ,
"example" : "cuda"
} ,
"model_dtype" : {
"type" : "string" ,
"example" : "torch.float16"
} ,
"model_id" : {
"type" : "string" ,
"description" : "Model info" ,
"example" : "bigscience/bloom-560m"
} ,
"model_pipeline_tag" : {
"type" : "string" ,
"example" : "text-generation" ,
"nullable" : true
} ,
"model_sha" : {
"type" : "string" ,
"example" : "e985a63cdc139290c5f700ff1929f0b5942cced2" ,
"nullable" : true
} ,
2024-07-03 01:53:35 -06:00
"router" : {
"type" : "string" ,
"description" : "Router Info" ,
"example" : "text-generation-router"
} ,
2024-01-26 11:04:57 -07:00
"sha" : {
"type" : "string" ,
"example" : "null" ,
"nullable" : true
} ,
"validation_workers" : {
"type" : "integer" ,
"example" : "2" ,
"minimum" : 0
} ,
"version" : {
"type" : "string" ,
"example" : "0.5.0"
} ,
"waiting_served_ratio" : {
"type" : "number" ,
"format" : "float" ,
"example" : "1.2"
}
}
} ,
"Message" : {
"type" : "object" ,
"required" : [
2024-07-03 01:53:35 -06:00
"role" ,
"content"
2024-01-26 11:04:57 -07:00
] ,
"properties" : {
"content" : {
2024-07-03 01:53:35 -06:00
"$ref" : "#/components/schemas/MessageContent"
2024-01-26 11:04:57 -07:00
} ,
2024-02-16 09:50:57 -07:00
"name" : {
"type" : "string" ,
"example" : "\"David\"" ,
"nullable" : true
} ,
2024-01-26 11:04:57 -07:00
"role" : {
"type" : "string" ,
"example" : "user"
}
}
} ,
"PrefillToken" : {
"type" : "object" ,
"required" : [
"id" ,
"text" ,
"logprob"
] ,
"properties" : {
"id" : {
"type" : "integer" ,
"format" : "int32" ,
"example" : 0 ,
"minimum" : 0
} ,
"logprob" : {
"type" : "number" ,
"format" : "float" ,
"example" : -0.34 ,
"nullable" : true
} ,
"text" : {
"type" : "string" ,
"example" : "test"
}
}
} ,
2024-07-03 04:56:27 -06:00
"Prompt" : {
"type" : "array" ,
"items" : {
"type" : "string"
}
} ,
2024-01-26 11:04:57 -07:00
"SimpleToken" : {
"type" : "object" ,
"required" : [
"id" ,
"text" ,
"start" ,
"stop"
] ,
"properties" : {
"id" : {
"type" : "integer" ,
"format" : "int32" ,
"example" : 0 ,
"minimum" : 0
} ,
"start" : {
"type" : "integer" ,
"example" : 0 ,
"minimum" : 0
} ,
"stop" : {
"type" : "integer" ,
"example" : 2 ,
"minimum" : 0
} ,
"text" : {
"type" : "string" ,
"example" : "test"
}
}
} ,
"StreamDetails" : {
"type" : "object" ,
"required" : [
"finish_reason" ,
"generated_tokens"
] ,
"properties" : {
"finish_reason" : {
"$ref" : "#/components/schemas/FinishReason"
} ,
"generated_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
"example" : 1 ,
"minimum" : 0
} ,
"seed" : {
"type" : "integer" ,
"format" : "int64" ,
"example" : 42 ,
"nullable" : true ,
"minimum" : 0
}
}
} ,
"StreamResponse" : {
"type" : "object" ,
"required" : [
"index" ,
"token"
] ,
"properties" : {
"details" : {
"allOf" : [
{
"$ref" : "#/components/schemas/StreamDetails"
}
] ,
"default" : "null" ,
"nullable" : true
} ,
"generated_text" : {
"type" : "string" ,
"default" : "null" ,
"example" : "test" ,
"nullable" : true
} ,
"index" : {
"type" : "integer" ,
"format" : "int32" ,
"minimum" : 0
} ,
"token" : {
"$ref" : "#/components/schemas/Token"
} ,
"top_tokens" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/Token"
}
}
}
} ,
"Token" : {
"type" : "object" ,
"required" : [
"id" ,
"text" ,
"logprob" ,
"special"
] ,
"properties" : {
"id" : {
"type" : "integer" ,
"format" : "int32" ,
"example" : 0 ,
"minimum" : 0
} ,
"logprob" : {
"type" : "number" ,
"format" : "float" ,
"example" : -0.34 ,
"nullable" : true
} ,
"special" : {
"type" : "boolean" ,
"example" : "false"
} ,
"text" : {
"type" : "string" ,
"example" : "test"
}
}
} ,
"TokenizeResponse" : {
"type" : "array" ,
"items" : {
"$ref" : "#/components/schemas/SimpleToken"
}
2024-02-21 07:30:45 -07:00
} ,
2024-03-29 12:17:24 -06:00
"Tool" : {
"type" : "object" ,
"required" : [
"type" ,
"function"
] ,
"properties" : {
"function" : {
"$ref" : "#/components/schemas/FunctionDefinition"
} ,
"type" : {
"type" : "string" ,
"example" : "function"
}
}
} ,
"ToolCall" : {
"type" : "object" ,
"required" : [
"id" ,
"type" ,
"function"
] ,
"properties" : {
"function" : {
"$ref" : "#/components/schemas/FunctionDefinition"
} ,
"id" : {
2024-07-03 01:53:35 -06:00
"type" : "string"
2024-03-29 12:17:24 -06:00
} ,
"type" : {
"type" : "string"
}
}
} ,
"ToolType" : {
"oneOf" : [
2024-07-03 01:53:35 -06:00
{
"type" : "object" ,
"default" : null ,
"nullable" : true
} ,
{
"type" : "string"
} ,
2024-03-29 12:17:24 -06:00
{
"type" : "object" ,
"required" : [
2024-07-03 01:53:35 -06:00
"function"
2024-03-29 12:17:24 -06:00
] ,
"properties" : {
2024-07-03 01:53:35 -06:00
"function" : {
"$ref" : "#/components/schemas/FunctionName"
2024-03-29 12:17:24 -06:00
}
}
}
]
} ,
2024-02-21 07:30:45 -07:00
"Usage" : {
"type" : "object" ,
"required" : [
"prompt_tokens" ,
"completion_tokens" ,
"total_tokens"
] ,
"properties" : {
"completion_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
"minimum" : 0
} ,
"prompt_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
"minimum" : 0
} ,
"total_tokens" : {
"type" : "integer" ,
"format" : "int32" ,
"minimum" : 0
}
}
2024-01-26 11:04:57 -07:00
}
}
} ,
"tags" : [
{
"name" : "Text Generation Inference" ,
"description" : "Hugging Face Text Generation Inference API"
}
]
2024-02-16 03:58:58 -07:00
}