// hf_text-generation-inference/proto/v3/generate.proto

syntax = "proto3";

package generate.v3;

/// RPC service covering model info, shard discovery, batch-cache management,
/// warmup, prefill/decode and health checking.
service TextGenerationService {
    /// Returns model info
    rpc Info(InfoRequest) returns (InfoResponse);
    /// Service discovery: returns the urls of the other shards
    rpc ServiceDiscovery(ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
    /// Empties the batch cache
    rpc ClearCache(ClearCacheRequest) returns (ClearCacheResponse);
    /// Removes requests from a cached batch
    rpc FilterBatch(FilterBatchRequest) returns (FilterBatchResponse);
    /// Warms up the model and computes the max cache size
    rpc Warmup(WarmupRequest) returns (WarmupResponse);
    /// Prefills a batch and decodes the first token
    rpc Prefill(PrefillRequest) returns (PrefillResponse);
    /// Decodes a token for a list of prefilled batches
    rpc Decode(DecodeRequest) returns (DecodeResponse);
    /// Health check
    rpc Health(HealthRequest) returns (HealthResponse);
}

/// Empty request of the Health RPC
message HealthRequest {}
/// Empty response of the Health RPC
message HealthResponse {}

/// Empty request of the Info RPC
message InfoRequest {}

/// Model info returned by the Info RPC
message InfoResponse {
    /// NOTE(review): presumably whether the model needs padded inputs - confirm
    bool requires_padding = 1;
    /// Data type name (string format is not specified in this file)
    string dtype = 2;
    /// Device type name (string format is not specified in this file)
    string device_type = 3;
    /// Window size; `optional` gives explicit presence, so unset differs from 0.
    /// NOTE(review): presumably an attention window measured in tokens - confirm
    optional uint32 window_size = 4;
    /// NOTE(review): looks like a speculative decoding setting - confirm meaning
    uint32 speculate = 5;
}

/// Empty request of the ServiceDiscovery RPC
message ServiceDiscoveryRequest {}

/// Response of the ServiceDiscovery RPC
message ServiceDiscoveryResponse {
    /// Urls of the other shards
    repeated string urls = 1;
}

/// Request of the ClearCache RPC
message ClearCacheRequest {
    /// Optional batch id; explicit presence distinguishes "unset" from id 0.
    /// NOTE(review): presumably an unset id clears every cached batch - confirm
    optional uint64 id = 1;
}

/// Empty response of the ClearCache RPC
message ClearCacheResponse {}

/// An image: raw bytes together with their MIME type.
message Image {
    /// Binary image data.
    bytes data = 1;

    /// Image MIME type.
    string mimetype = 2;
}

/// One chunk of an input: either text or an image.
message InputChunk {
    /// At most one of the members below is set; setting one clears the other.
    oneof chunk {
        /// Plain text data
        string text = 1;
        /// Image data
        Image image = 2;
    }
}

/// An input expressed as a list of chunks.
message Input {
    /// Chunks making up the input, in order
    repeated InputChunk chunks = 1;
}

/// Kind of grammar; used by NextTokenChooserParameters.grammar_type.
enum GrammarType {
    /// No grammar (zero value, hence the proto3 default)
    GRAMMAR_TYPE_NONE = 0;
    /// JSON grammar
    GRAMMAR_TYPE_JSON = 1;
    /// Regex grammar
    GRAMMAR_TYPE_REGEX = 2;
}

/// Parameters controlling how the next token is chosen.
message NextTokenChooserParameters {
    /// exponential scaling of the output probability distribution
    float temperature = 1;
    /// restricting to the k highest probability elements
    uint32 top_k = 2;
    /// restricting to the top tokens whose probabilities sum to <= top_p
    float top_p = 3;
    /// typical_p cutoff.
    /// NOTE(review): the previous comment was a copy of top_p's; presumably
    /// this is the probability mass kept by typical decoding - confirm
    float typical_p = 4;
    /// apply sampling on the logits
    bool do_sample = 5;
    /// random seed for sampling
    uint64 seed = 6;
    /// repetition penalty
    float repetition_penalty = 7;
    /// frequency penalty
    /// (fields 9 and 8 are declared out of numeric order; field numbers are
    /// the wire identity and must not be renumbered)
    float frequency_penalty = 9;
    /// token watermarking using "A Watermark for Large Language Models"
    bool watermark = 8;
    /// grammar (applied if not empty)
    string grammar = 10;
    /// grammar type
    GrammarType grammar_type = 11;
}

/// Parameters deciding when generation stops.
message StoppingCriteriaParameters {
    /// Maximum number of generated tokens
    uint32 max_new_tokens = 1;
    /// Optional stopping sequences
    repeated string stop_sequences = 2;
    /// Ignore end of sequence token
    /// used for benchmarking
    bool ignore_eos_token = 3;
}

/// A single generation request.
message Request {
    /// Id of the request
    uint64 id = 1;
    /// Generation context, as chunks
    /// (field 8 is declared out of numeric order; do not renumber)
    Input input_chunks = 8;
    /// Generation context as a string (stringified input_chunks)
    string inputs = 2;
    /// Truncation applied to the context
    uint32 truncate = 3;
    /// Parameters of the next token chooser
    NextTokenChooserParameters parameters = 4;
    /// Parameters of the stopping criteria
    StoppingCriteriaParameters stopping_parameters = 5;
    /// Whether to return prefill logprobs
    bool prefill_logprobs = 6;
    /// Number of most likely tokens to return
    uint32 top_n_tokens = 7;
    /// Blocks for paged attention
    repeated uint32 blocks = 9;
    /// Slots for paged attention
    repeated uint32 slots = 10;
    /// LoRA adapter id (a string identifier; explicit presence via `optional`)
    optional string adapter_id = 11;
}

/// A batch of full requests.
message Batch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests
    repeated Request requests = 2;
    /// Batch size (==len(requests))
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
    /// Maximum number of Paged Attention blocks
    uint32 max_blocks = 5;
}

/// A cached batch: carries only request ids rather than full requests.
message CachedBatch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests ids
    repeated uint64 request_ids = 2;
    /// Batch size (presumably ==len(request_ids); this message has no
    /// `requests` field - confirm)
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
}

/// Reason a generation finished.
/// NOTE(review): there is no UNSPECIFIED zero value, so an unset
/// finish_reason reads as FINISH_REASON_LENGTH. Changing that would break the
/// wire contract, so it is only flagged here.
enum FinishReason {
    /// Finished because of length (presumably the max_new_tokens limit - confirm)
    FINISH_REASON_LENGTH = 0;
    /// Finished on the end-of-sequence token
    FINISH_REASON_EOS_TOKEN = 1;
    /// Finished on a stop sequence
    FINISH_REASON_STOP_SEQUENCE = 2;
}

/// Generated text together with its token count, finish reason and seed.
message GeneratedText {
    /// Output
    string text = 1;
    /// Number of generated tokens
    uint32 generated_tokens = 2;
    /// Finish reason
    FinishReason finish_reason = 3;
    /// Seed (explicit presence: may be unset)
    optional uint64 seed = 4;
}

/// A set of tokens stored as parallel lists.
/// NOTE(review): the four lists are presumably index-aligned (entry i of each
/// describes the same token) - confirm against producers/consumers.
message Tokens {
    /// Token IDs
    repeated uint32 ids = 1;
    /// Logprobs
    repeated float logprobs = 2;
    /// tokens (as text)
    repeated string texts = 3;
    /// special (whether each token is a special token)
    repeated bool is_special = 4;
}

/// Generation output for one request.
message Generation {
    /// Request ID
    uint64 request_id = 1;
    /// Prefill tokens (optional)
    Tokens prefill_tokens = 2;
    /// NOTE(review): presumably the tokens produced by this step - confirm
    Tokens tokens = 3;
    /// Complete generated text (explicit presence: may be unset)
    optional GeneratedText generated_text = 4;
    /// Top tokens
    repeated Tokens top_tokens = 5;
}

/// Request of the FilterBatch RPC
message FilterBatchRequest {
    /// Batch ID
    uint64 batch_id = 1;
    /// Requests to keep
    repeated uint64 request_ids = 2;
}

/// Response of the FilterBatch RPC
message FilterBatchResponse {
    /// Filtered Batch (cached)
    CachedBatch batch = 1;
}


/// Request of the Prefill RPC
message PrefillRequest {
    /// Batch
    Batch batch = 1;
}

/// Response of the Prefill RPC
message PrefillResponse {
    /// Generation
    repeated Generation generations = 1;
    /// Next batch (cached); may be unset
    optional CachedBatch batch = 2;
    /// Forward elapsed time in nanoseconds
    uint64 forward_ns = 3;
    /// Decode elapsed time in nanoseconds
    uint64 decode_ns = 4;
    /// Total elapsed time in nanoseconds
    uint64 total_ns = 5;
}

/// Request of the Decode RPC
message DecodeRequest {
    /// Cached batches
    repeated CachedBatch batches = 1;
}

/// Response of the Decode RPC
message DecodeResponse {
    /// Decodes
    repeated Generation generations = 1;
    /// Next batch (cached); may be unset
    optional CachedBatch batch = 2;
    /// Forward elapsed time in nanoseconds
    uint64 forward_ns = 3;
    /// Decode elapsed time in nanoseconds
    uint64 decode_ns = 4;
    /// Total elapsed time in nanoseconds
    uint64 total_ns = 5;
    /// Concatenate elapsed time in nanoseconds (explicit presence: may be unset)
    optional uint64 concat_ns = 6;
}

/// Request of the Warmup RPC
message WarmupRequest {
    /// Batch to warmup on
    Batch batch = 1;
    /// Maximum input length (NOTE(review): presumably in tokens - confirm)
    uint32 max_input_length = 2;
    /// Maximum number of prefill tokens
    uint32 max_prefill_tokens = 3;
    /// Maximum number of total tokens
    uint32 max_total_tokens = 4;
}

/// Response of the Warmup RPC
message WarmupResponse {
    /// Maximum number of tokens supported by the model (explicit presence: may be unset)
    optional uint32 max_supported_total_tokens = 1;
}
feat: add SchedulerV3 (#1996) - Refactor code to allow supporting multiple versions of the generate.proto at the same time - Add v3/generate.proto (ISO to generate.proto for now but allow for future changes without impacting v2 backends) - Add Schedule trait to abstract queuing and batching mechanisms that will be different in the future - Add SchedulerV2/V3 impl 2024-06-04 07:56:56 -06:00			`syntax = "proto3";`

			`package generate.v3;`

			`service TextGenerationService {`
			`/// Model Info`
			`rpc Info (InfoRequest) returns (InfoResponse) {}`
			`/// Service discovery`
			`rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse) {}`
			`/// Empties batch cache`
			`rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);`
			`/// Remove requests from a cached batch`
			`rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);`
			`/// Warmup the model and compute max cache size`
			`rpc Warmup (WarmupRequest) returns (WarmupResponse);`
			`/// Prefill batch and decode first token`
			`rpc Prefill (PrefillRequest) returns (PrefillResponse);`
			`/// Decode token for a list of prefilled batches`
			`rpc Decode (DecodeRequest) returns (DecodeResponse);`
			`/// Health check`
			`rpc Health (HealthRequest) returns (HealthResponse);`
			`}`

			`message HealthRequest {}`
			`message HealthResponse {}`

			`/// Empty request`
			`message InfoRequest {}`

			`message InfoResponse {`
			`bool requires_padding = 1;`
			`string dtype = 2;`
			`string device_type = 3;`
			`optional uint32 window_size = 4;`
			`uint32 speculate = 5;`
			`}`

			`/// Empty request`
			`message ServiceDiscoveryRequest {}`

			`message ServiceDiscoveryResponse {`
			`/// Other shards urls`
			`repeated string urls = 1;`
			`}`

			`message ClearCacheRequest {`
			`/// Optional batch id`
			`optional uint64 id = 1;`
			`}`

			`/// Empty response`
			`message ClearCacheResponse {}`

			`message Image {`
			`/// Binary image data.`
			`bytes data = 1;`

			`/// Image MIME type.`
			`string mimetype = 2;`
			`}`

			`message InputChunk {`
			`oneof chunk {`
			`/// Plain text data`
			`string text = 1;`
			`/// Image data`
			`Image image = 2;`
			`}`
			`}`

			`message Input {`
			`repeated InputChunk chunks = 1;`
			`}`

			`enum GrammarType {`
			`GRAMMAR_TYPE_NONE = 0;`
			`GRAMMAR_TYPE_JSON = 1;`
			`GRAMMAR_TYPE_REGEX = 2;`
			`}`

			`message NextTokenChooserParameters {`
			`/// exponential scaling output probability distribution`
			`float temperature = 1;`
			`/// restricting to the k highest probability elements`
			`uint32 top_k = 2;`
			`/// restricting to top tokens summing to prob_cut_off <= prob_cut_off`
			`float top_p = 3;`
			`/// restricting to top tokens summing to prob_cut_off <= prob_cut_off`
			`float typical_p = 4;`
			`/// apply sampling on the logits`
			`bool do_sample = 5;`
			`/// random seed for sampling`
			`uint64 seed = 6;`
			`/// repetition penalty`
			`float repetition_penalty = 7;`
			`/// frequency penalty`
			`float frequency_penalty = 9;`
			`/// token watermarking using "A Watermark for Large Language Models"`
			`bool watermark = 8;`
			`/// grammar (applied if not empty)`
			`string grammar = 10;`
			`/// grammar type`
			`GrammarType grammar_type = 11;`
			`}`

			`message StoppingCriteriaParameters {`
			`/// Maximum number of generated tokens`
			`uint32 max_new_tokens = 1;`
			`/// Optional stopping sequences`
			`repeated string stop_sequences = 2;`
			`/// Ignore end of sequence token`
			`/// used for benchmarking`
			`bool ignore_eos_token = 3;`
			`}`

			`message Request {`
			`/// Request ID`
			`uint64 id = 1;`
			`/// The generation context as chunks`
			`Input input_chunks = 8;`
			`/// The generation context, stringified input_chunks`
			`string inputs = 2;`
			`/// Context truncation`
			`uint32 truncate = 3;`
			`/// Next Token Chooser Parameters`
			`NextTokenChooserParameters parameters = 4;`
			`/// Stopping Criteria Parameters`
			`StoppingCriteriaParameters stopping_parameters = 5;`
			`/// Return prefill logprobs`
			`bool prefill_logprobs = 6;`
			`/// Return most likely n tokens`
			`uint32 top_n_tokens = 7;`
feat: move allocation logic to rust (#1835) Close #2007 2024-06-05 04:18:38 -06:00			`/// Paged attention blocks`
			`repeated uint32 blocks = 9;`
			`/// Paged attention slots`
			`repeated uint32 slots = 10;`
Enable multiple LoRa adapters (#2010) * feat: first draft load multiple lora * feat: load weights within layer and refactor lora pass * fix: refactor and reduce lora math * feat: baseline impl single request multi lora support * feat: prefer lorax implementation and port loading logic * fix: prefer adapter_data and refactors * feat: perfer loraxs custom punica kernels and add mlp loras * fix: adjust batch for bgmv * fix: adjust adapter_segments logic when in batch * fix: refactor and move changes to v3 proto * fix: pass model_id for all flash causal lms * fix: pass model_id for all causal and seq2seq lms * fix: add model_id to model test * feat: add lora support to mistral and refactors * feat: prefer model id in request * fix: include rust code for adapter id * feat: bump launcher and add new lora docs * feat: support base model generation and refactors * fix: rename doc to retry ci build * feat: support if vlm models * fix: add adapter_data param and avoid missing layers * fix: add adapter_data param to phi and neox * fix: update all models forwards to include adapter_data * fix: add model_id to IdeficsCausalLM * Update lora.md Fixed a typo * Update lora.md Fixing spam image * fix: add lora kernel to dockerfile, support running without kernels and refactors * fix: avoid dockerfile conflict * fix: refactors and adjust flash llama lora logic * fix: skip llama test due to CI issue (temp) * fix: skip llama test CI (temp) 2 * fix: revert skips and prefer updated ci token for tests * fix: refactors and helpful comments * fix: add noop in TensorParallelAdapterRowLinear too * fix: refactor and move shard_lora_weights logic * fix: exit early if no adapter_data --------- Co-authored-by: Derek <datavistics@gmail.com> 2024-06-25 12:46:27 -06:00			`/// LORA adapter index`
			`optional string adapter_id = 11;`
feat: add SchedulerV3 (#1996) - Refactor code to allow supporting multiple versions of the generate.proto at the same time - Add v3/generate.proto (ISO to generate.proto for now but allow for future changes without impacting v2 backends) - Add Schedule trait to abstract queuing and batching mechanisms that will be different in the future - Add SchedulerV2/V3 impl 2024-06-04 07:56:56 -06:00			`}`

			`message Batch {`
			`/// Batch ID`
			`uint64 id = 1;`
			`/// Individual requests`
			`repeated Request requests = 2;`
			`/// Batch size (==len(requests))`
			`uint32 size = 3;`
			`/// Maximum number of tokens this batch will grow to`
			`uint32 max_tokens = 4;`
feat: move allocation logic to rust (#1835) Close #2007 2024-06-05 04:18:38 -06:00			`/// Maximum number of Paged Attention blocks`
			`uint32 max_blocks = 5;`
feat: add SchedulerV3 (#1996) - Refactor code to allow supporting multiple versions of the generate.proto at the same time - Add v3/generate.proto (ISO to generate.proto for now but allow for future changes without impacting v2 backends) - Add Schedule trait to abstract queuing and batching mechanisms that will be different in the future - Add SchedulerV2/V3 impl 2024-06-04 07:56:56 -06:00			`}`

			`message CachedBatch {`
			`/// Batch ID`
			`uint64 id = 1;`
			`/// Individual requests ids`
			`repeated uint64 request_ids = 2;`
			`/// Batch size (==len(requests))`
			`uint32 size = 3;`
			`/// Maximum number of tokens this batch will grow to`
			`uint32 max_tokens = 4;`
			`}`

			`enum FinishReason {`
			`FINISH_REASON_LENGTH = 0;`
			`FINISH_REASON_EOS_TOKEN = 1;`
			`FINISH_REASON_STOP_SEQUENCE = 2;`
			`}`

			`message GeneratedText {`
			`/// Output`
			`string text = 1;`
			`/// Number of generated tokens`
			`uint32 generated_tokens = 2;`
			`/// Finish reason`
			`FinishReason finish_reason = 3;`
			`/// Seed`
			`optional uint64 seed = 4;`
			`}`

			`message Tokens {`
			`/// Token IDs`
			`repeated uint32 ids = 1;`
			`/// Logprobs`
			`repeated float logprobs = 2;`
			`/// tokens`
			`repeated string texts = 3;`
			`/// special`
			`repeated bool is_special = 4;`
			`}`

			`message Generation {`
			`/// Request ID`
			`uint64 request_id = 1;`
			`/// Prefill tokens (optional)`
			`Tokens prefill_tokens = 2;`
			`Tokens tokens = 3;`
			`/// Complete generated text`
			`optional GeneratedText generated_text = 4;`
			`/// Top tokens`
			`repeated Tokens top_tokens = 5;`
			`}`

			`message FilterBatchRequest {`
			`/// Batch ID`
			`uint64 batch_id = 1;`
			`/// Requests to keep`
			`repeated uint64 request_ids = 2;`
			`}`

			`message FilterBatchResponse {`
			`/// Filtered Batch (cached)`
			`CachedBatch batch = 1;`
			`}`


			`message PrefillRequest {`
			`/// Batch`
			`Batch batch = 1;`
			`}`

			`message PrefillResponse {`
			`/// Generation`
			`repeated Generation generations = 1;`
			`/// Next batch (cached)`
			`optional CachedBatch batch = 2;`
			`/// Forward elapsed time in nanoseconds`
			`uint64 forward_ns = 3;`
			`/// Decode elapsed time in nanoseconds`
			`uint64 decode_ns = 4;`
			`/// Total elapsed time in nanoseconds`
			`uint64 total_ns = 5;`
			`}`

			`message DecodeRequest {`
			`/// Cached batches`
			`repeated CachedBatch batches = 1;`
			`}`

			`message DecodeResponse {`
			`/// Decodes`
			`repeated Generation generations = 1;`
			`/// Next batch (cached)`
			`optional CachedBatch batch = 2;`
			`/// Forward elapsed time in nanoseconds`
			`uint64 forward_ns = 3;`
			`/// Decode elapsed time in nanoseconds`
			`uint64 decode_ns = 4;`
			`/// Total elapsed time in nanoseconds`
			`uint64 total_ns = 5;`
			`/// Concatenate elapsed time in nanoseconds`
			`optional uint64 concat_ns = 6;`
			`}`

			`message WarmupRequest {`
			`/// Batch to warmup on`
			`Batch batch = 1;`
			`uint32 max_input_length = 2;`
			`uint32 max_prefill_tokens = 3;`
			`uint32 max_total_tokens = 4;`
			`}`

			`message WarmupResponse {`
			`/// Maximum number of tokens supported by the model`
			`optional uint32 max_supported_total_tokens = 1;`
			`}`