{
  "openapi": "3.0.3",
  "info": {
    "title": "Text Generation Inference",
    "description": "Text Generation Webserver",
    "contact": {
      "name": "Olivier Dehaene"
    },
    "license": {
      "name": "Apache 2.0",
      "url": "https://www.apache.org/licenses/LICENSE-2.0"
    },
    "version": "1.3.4"
  },
  "paths": {
    "/": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens if `stream == false` or a stream of tokens if `stream == true`",
        "description": "Generate tokens if `stream == false` or a stream of tokens if `stream == true`",
        "operationId": "compat_generate",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/CompatGenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/GenerateResponse"
                }
              },
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/StreamResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation"
                }
              }
            }
          }
        }
      }
    },
    "/generate": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate tokens",
        "description": "Generate tokens",
        "operationId": "generate",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/GenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/GenerateResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation"
                }
              }
            }
          }
        }
      }
    },
    "/generate_stream": {
      "post": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Generate a stream of tokens using Server-Sent Events",
        "description": "Generate a stream of tokens using Server-Sent Events",
        "operationId": "generate_stream",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/GenerateRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "Generated Text",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/StreamResponse"
                }
              }
            }
          },
          "422": {
            "description": "Input validation error",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Input validation error"
                }
              }
            }
          },
          "424": {
            "description": "Generation Error",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Request failed during generation"
                }
              }
            }
          },
          "429": {
            "description": "Model is overloaded",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Model is overloaded"
                }
              }
            }
          },
          "500": {
            "description": "Incomplete generation",
            "content": {
              "text/event-stream": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "Incomplete generation"
                }
              }
            }
          }
        }
      }
    },
    "/health": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Health check method",
        "description": "Health check method",
        "operationId": "health",
        "responses": {
          "200": {
            "description": "Everything is working fine"
          },
          "503": {
            "description": "Text generation inference is down",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ErrorResponse"
                },
                "example": {
                  "error": "unhealthy",
                  "error_type": "healthcheck"
                }
              }
            }
          }
        }
      }
    },
    "/info": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Text Generation Inference endpoint info",
        "description": "Text Generation Inference endpoint info",
        "operationId": "get_model_info",
        "responses": {
          "200": {
            "description": "Served model info",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/Info"
                }
              }
            }
          }
        }
      }
    },
    "/metrics": {
      "get": {
        "tags": [
          "Text Generation Inference"
        ],
        "summary": "Prometheus metrics scrape endpoint",
        "description": "Prometheus metrics scrape endpoint",
        "operationId": "metrics",
        "responses": {
          "200": {
            "description": "Prometheus Metrics",
            "content": {
              "text/plain": {
                "schema": {
                  "type": "string"
                }
              }
            }
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "BestOfSequence": {
        "type": "object",
        "required": [
          "generated_text",
          "finish_reason",
          "generated_tokens",
          "prefill",
          "tokens"
        ],
        "properties": {
          "finish_reason": {
            "$ref": "#/components/schemas/FinishReason"
          },
          "generated_text": {
            "type": "string",
            "example": "test"
          },
          "generated_tokens": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "prefill": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/PrefillToken"
            }
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          },
          "tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "type": "array",
              "items": {
                "$ref": "#/components/schemas/Token"
              }
            }
          }
        }
      },
      "CompatGenerateRequest": {
        "type": "object",
        "required": [
          "inputs"
        ],
        "properties": {
          "inputs": {
            "type": "string",
            "example": "My name is Olivier and I"
          },
          "parameters": {
            "$ref": "#/components/schemas/GenerateParameters"
          },
          "stream": {
            "type": "boolean",
            "default": "false"
          }
        }
      },
      "Details": {
        "type": "object",
        "required": [
          "finish_reason",
          "generated_tokens",
          "prefill",
          "tokens"
        ],
        "properties": {
          "best_of_sequences": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/BestOfSequence"
            },
            "nullable": true
          },
          "finish_reason": {
            "$ref": "#/components/schemas/FinishReason"
          },
          "generated_tokens": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "prefill": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/PrefillToken"
            }
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          },
          "tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "type": "array",
              "items": {
                "$ref": "#/components/schemas/Token"
              }
            }
          }
        }
      },
      "ErrorResponse": {
        "type": "object",
        "required": [
          "error",
          "error_type"
        ],
        "properties": {
          "error": {
            "type": "string"
          },
          "error_type": {
            "type": "string"
          }
        }
      },
      "FinishReason": {
        "type": "string",
        "enum": [
          "length",
          "eos_token",
          "stop_sequence"
        ]
      },
      "GenerateParameters": {
        "type": "object",
        "properties": {
          "best_of": {
            "type": "integer",
            "default": "null",
            "example": 1,
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "decoder_input_details": {
            "type": "boolean",
            "default": "true"
          },
          "details": {
            "type": "boolean",
            "default": "true"
          },
          "do_sample": {
            "type": "boolean",
            "default": "false",
            "example": true
          },
          "max_new_tokens": {
            "type": "integer",
            "format": "int32",
            "default": "20",
            "example": "20",
            "nullable": true,
            "minimum": 0
          },
          "repetition_penalty": {
            "type": "number",
            "format": "float",
            "default": "null",
            "example": 1.03,
            "nullable": true,
            "exclusiveMinimum": 0
          },
          "return_full_text": {
            "type": "boolean",
            "default": "null",
            "example": false,
            "nullable": true
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "default": "null",
            "example": "null",
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "stop": {
            "type": "array",
            "items": {
              "type": "string"
            },
            "example": [
              "photographer"
            ],
            "maxItems": 4
          },
          "temperature": {
            "type": "number",
            "format": "float",
            "default": "null",
            "example": 0.5,
            "nullable": true,
            "exclusiveMinimum": 0
          },
          "top_k": {
            "type": "integer",
            "format": "int32",
            "default": "null",
            "example": 10,
            "nullable": true,
            "exclusiveMinimum": 0
          },
          "top_n_tokens": {
            "type": "integer",
            "format": "int32",
            "default": "null",
            "example": 5,
            "nullable": true,
            "minimum": 0,
            "exclusiveMinimum": 0
          },
          "top_p": {
            "type": "number",
            "format": "float",
            "default": "null",
            "example": 0.95,
            "nullable": true,
            "maximum": 1,
            "exclusiveMinimum": 0
          },
          "truncate": {
            "type": "integer",
            "default": "null",
            "example": "null",
            "nullable": true,
            "minimum": 0
          },
          "typical_p": {
            "type": "number",
            "format": "float",
            "default": "null",
            "example": 0.95,
            "nullable": true,
            "maximum": 1,
            "exclusiveMinimum": 0
          },
          "watermark": {
            "type": "boolean",
            "default": "false",
            "example": true
          }
        }
      },
      "GenerateRequest": {
        "type": "object",
        "required": [
          "inputs"
        ],
        "properties": {
          "inputs": {
            "type": "string",
            "example": "My name is Olivier and I"
          },
          "parameters": {
            "$ref": "#/components/schemas/GenerateParameters"
          }
        }
      },
      "GenerateResponse": {
        "type": "object",
        "required": [
          "generated_text"
        ],
        "properties": {
          "details": {
            "allOf": [
              {
                "$ref": "#/components/schemas/Details"
              }
            ],
            "nullable": true
          },
          "generated_text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "Info": {
        "type": "object",
        "required": [
          "model_id",
          "model_dtype",
          "model_device_type",
          "max_concurrent_requests",
          "max_best_of",
          "max_stop_sequences",
          "max_input_length",
          "max_total_tokens",
          "waiting_served_ratio",
          "max_batch_total_tokens",
          "max_waiting_tokens",
          "validation_workers",
          "version"
        ],
        "properties": {
          "docker_label": {
            "type": "string",
            "example": "null",
            "nullable": true
          },
          "max_batch_total_tokens": {
            "type": "integer",
            "format": "int32",
            "example": "32000",
            "minimum": 0
          },
          "max_best_of": {
            "type": "integer",
            "example": "2",
            "minimum": 0
          },
          "max_concurrent_requests": {
            "type": "integer",
            "description": "Router Parameters",
            "example": "128",
            "minimum": 0
          },
          "max_input_length": {
            "type": "integer",
            "example": "1024",
            "minimum": 0
          },
          "max_stop_sequences": {
            "type": "integer",
            "example": "4",
            "minimum": 0
          },
          "max_total_tokens": {
            "type": "integer",
            "example": "2048",
            "minimum": 0
          },
          "max_waiting_tokens": {
            "type": "integer",
            "example": "20",
            "minimum": 0
          },
          "model_device_type": {
            "type": "string",
            "example": "cuda"
          },
          "model_dtype": {
            "type": "string",
            "example": "torch.float16"
          },
          "model_id": {
            "type": "string",
            "description": "Model info",
            "example": "bigscience/bloom-560m"
          },
          "model_pipeline_tag": {
            "type": "string",
            "example": "text-generation",
            "nullable": true
          },
          "model_sha": {
            "type": "string",
            "example": "e985a63cdc139290c5f700ff1929f0b5942cced2",
            "nullable": true
          },
          "sha": {
            "type": "string",
            "example": "null",
            "nullable": true
          },
          "validation_workers": {
            "type": "integer",
            "example": "2",
            "minimum": 0
          },
          "version": {
            "type": "string",
            "description": "Router Info",
            "example": "0.5.0"
          },
          "waiting_served_ratio": {
            "type": "number",
            "format": "float",
            "example": "1.2"
          }
        }
      },
      "PrefillToken": {
        "type": "object",
        "required": [
          "id",
          "text",
          "logprob"
        ],
        "properties": {
          "id": {
            "type": "integer",
            "format": "int32",
            "example": 0,
            "minimum": 0
          },
          "logprob": {
            "type": "number",
            "format": "float",
            "example": -0.34,
            "nullable": true
          },
          "text": {
            "type": "string",
            "example": "test"
          }
        }
      },
      "StreamDetails": {
        "type": "object",
        "required": [
          "finish_reason",
          "generated_tokens"
        ],
        "properties": {
          "finish_reason": {
            "$ref": "#/components/schemas/FinishReason"
          },
          "generated_tokens": {
            "type": "integer",
            "format": "int32",
            "example": 1,
            "minimum": 0
          },
          "seed": {
            "type": "integer",
            "format": "int64",
            "example": 42,
            "nullable": true,
            "minimum": 0
          }
        }
      },
      "StreamResponse": {
        "type": "object",
        "required": [
          "token"
        ],
        "properties": {
          "details": {
            "allOf": [
              {
                "$ref": "#/components/schemas/StreamDetails"
              }
            ],
            "default": "null",
            "nullable": true
          },
          "generated_text": {
            "type": "string",
            "default": "null",
            "example": "test",
            "nullable": true
          },
          "token": {
            "$ref": "#/components/schemas/Token"
          },
          "top_tokens": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Token"
            }
          }
        }
      },
      "Token": {
        "type": "object",
        "required": [
          "id",
          "text",
          "logprob",
          "special"
        ],
        "properties": {
          "id": {
            "type": "integer",
            "format": "int32",
            "example": 0,
            "minimum": 0
          },
          "logprob": {
            "type": "number",
            "format": "float",
            "example": -0.34,
            "nullable": true
          },
          "special": {
            "type": "boolean",
            "example": "false"
          },
          "text": {
            "type": "string",
            "example": "test"
          }
        }
      }
    }
  },
  "tags": [
    {
      "name": "Text Generation Inference",
      "description": "Hugging Face Text Generation Inference API"
    }
  ]
}