// hf_text-generation-inference/proto/generate.proto

syntax = "proto3";

package generate.v1;

/// RPC service exposing shard discovery, cache management and the two
/// generation steps (Prefill, then Decode).
service TextGenerationService {
    /// Service discovery: the response lists the urls of the other shards
    rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
    /// Empties batch cache
    rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
    /// Prefill batch and decode first token.
    /// The response may carry the batch to pass to subsequent Decode calls.
    rpc Prefill (PrefillRequest) returns (PrefillResponse);
    /// Decode token for a list of prefilled batches
    rpc Decode (DecodeRequest) returns (DecodeResponse);
}

/// Request of the ServiceDiscovery RPC; currently carries no fields
message ServiceDiscoveryRequest {}

/// Response of the ServiceDiscovery RPC
message ServiceDiscoveryResponse {
    /// Urls of the other shards
    /// NOTE(review): the url format/scheme is not specified in this file
    repeated string urls = 1;
}

/// Request of the ClearCache RPC; currently carries no fields
message ClearCacheRequest {}

/// Response of the ClearCache RPC; currently carries no fields
message ClearCacheResponse {}

/// Parameters controlling how the next token is chosen for a request
message NextTokenChooserParameters {
    /// exponential scaling of the output probability distribution
    float temperature = 1;
    /// restricting to the k highest probability elements
    uint32 top_k = 2;
    /// restricting to the top tokens whose cumulative probability is <= top_p
    float top_p = 3;
    /// apply sampling on the logits
    bool do_sample = 4;
    /// random seed for sampling
    /// (plain proto3 scalar: a seed of 0 cannot be told apart from an unset seed)
    uint64 seed = 5;
}

/// Criteria that end the generation of a request
message StoppingCriteriaParameters {
    /// Maximum number of generated tokens
    uint32 max_new_tokens = 1;
    /// Optional stopping sequences (the list may be empty)
    repeated string stop_sequences = 2;
}

/// A single generation request, carried inside a Batch
message Request {
    /// Request ID
    uint64 id = 1;
    /// The generation context
    string inputs = 2;
    /// The number of tokens inside inputs
    uint32 input_length = 3;
    /// Next Token Chooser Parameters
    NextTokenChooserParameters parameters = 4;
    /// Stopping Criteria Parameters
    StoppingCriteriaParameters stopping_parameters = 5;
}

/// A group of requests processed together
message Batch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests
    repeated Request requests = 2;
    /// Batch size (==len(requests))
    /// NOTE(review): redundant with `requests`; the schema itself does not
    /// enforce the equality, so presumably the sender must keep it in sync
    uint32 size = 3;
}

/// Complete generated text of a request, with its generation metadata
message GeneratedText {
    /// Output
    string text = 1;
    /// Number of generated tokens
    uint32 generated_tokens = 2;
    /// Finish reason
    /// NOTE(review): free-form string; the set of possible values is not
    /// defined in this file
    string finish_reason = 3;
    /// Seed
    /// (`optional` gives explicit presence: an unset seed is distinguishable from 0)
    optional uint64 seed = 4;
}

/// Per-token data for the prefill tokens, stored as three parallel lists
/// NOTE(review): presumably ids[i], logprobs[i] and texts[i] describe the same
/// token and all three lists have equal length - the schema does not enforce it
message PrefillTokens {
    /// Prefill Token IDs
    repeated uint32 ids = 1;
    /// Prefill Logprobs
    repeated float logprobs = 2;
    /// Prefill tokens
    repeated string texts = 3;
}

/// A token produced for one request, with optional prefill data and,
/// when available, the complete generated text
message Generation {
    /// Request ID
    uint64 request_id = 1;
    /// Prefill tokens (optional)
    PrefillTokens prefill_tokens = 2;
    /// Token ID
    uint32 token_id = 3;
    /// Logprob
    float token_logprob = 4;
    /// Text
    string token_text = 5;
    /// Complete generated text
    /// NOTE(review): presumably only set once the request has finished - confirm
    GeneratedText generated_text = 6;
}

/// Request of the Prefill RPC
message PrefillRequest {
    /// Batch to prefill
    Batch batch = 1;
}

/// Response of the Prefill RPC
message PrefillResponse {
    /// Generations returned by the prefill
    repeated Generation generations = 1;
    /// Next batch (cached)
    /// NOTE(review): presumably absent when no request is left to decode - confirm
    optional Batch batch = 2;
}

/// Request of the Decode RPC
message DecodeRequest {
    /// Cached batches
    repeated Batch batches = 1;
}

/// Response of the Decode RPC
message DecodeResponse {
    /// Generations returned by the decode
    repeated Generation generations = 1;
    /// Next batch (cached)
    /// NOTE(review): presumably absent when no request is left to decode - confirm
    optional Batch batch = 2;
}
Init 2022-10-08 04:30:12 -06:00			`syntax = "proto3";`

			`package generate.v1;`

Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`service TextGenerationService {`
Init 2022-10-08 04:30:12 -06:00			`/// Service discovery`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse) {}`
Init 2022-10-08 04:30:12 -06:00			`/// Empties batch cache`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`/// Prefill batch and decode first token`
			`rpc Prefill (PrefillRequest) returns (PrefillResponse);`
			`/// Decode token for a list of prefilled batches`
			`rpc Decode (DecodeRequest) returns (DecodeResponse);`
Init 2022-10-08 04:30:12 -06:00			`}`

Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Empty request`
			`message ServiceDiscoveryRequest {}`

Init 2022-10-08 04:30:12 -06:00			`message ServiceDiscoveryResponse {`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Other shards urls`
Init 2022-10-08 04:30:12 -06:00			`repeated string urls = 1;`
			`}`

Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Empty request`
			`message ClearCacheRequest {}`

			`/// Empty response`
			`message ClearCacheResponse {}`

feat: Return logprobs (#8) 2022-12-15 09:03:56 -07:00			`message NextTokenChooserParameters {`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// exponential scaling output probability distribution`
Init 2022-10-08 04:30:12 -06:00			`float temperature = 1;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// restricting to the k highest probability elements`
Init 2022-10-08 04:30:12 -06:00			`uint32 top_k = 2;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// restricting to top tokens summing to prob_cut_off <= prob_cut_off`
Init 2022-10-08 04:30:12 -06:00			`float top_p = 3;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// apply sampling on the logits`
Init 2022-10-08 04:30:12 -06:00			`bool do_sample = 4;`
feat: Support sampling seeding (#37) Co-authored-by: Yannic Kilcher <yk@users.noreply.github.com> 2023-01-30 07:36:16 -07:00			`/// random seed for sampling`
fix(server): fix seeding with multiple shards (#44) 2023-01-31 08:01:15 -07:00			`uint64 seed = 5;`
Init 2022-10-08 04:30:12 -06:00			`}`

feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`message StoppingCriteriaParameters {`
			`/// Maximum number of generated tokens`
			`uint32 max_new_tokens = 1;`
			`/// Optional stopping sequences`
			`repeated string stop_sequences = 2;`
			`}`

Init 2022-10-08 04:30:12 -06:00			`message Request {`
			`/// Request ID`
			`uint64 id = 1;`
			`/// The generation context`
			`string inputs = 2;`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// The number of tokens inside inputs`
			`uint32 input_length = 3;`
feat: Return logprobs (#8) 2022-12-15 09:03:56 -07:00			`/// Next Token Chooser Parameters`
			`NextTokenChooserParameters parameters = 4;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// Stopping Criteria Parameters`
			`StoppingCriteriaParameters stopping_parameters = 5;`
Init 2022-10-08 04:30:12 -06:00			`}`

			`message Batch {`
			`/// Batch ID`
			`uint64 id = 1;`
			`/// Individual requests`
			`repeated Request requests = 2;`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Batch size (==len(requests))`
			`uint32 size = 3;`
Init 2022-10-08 04:30:12 -06:00			`}`

Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`message GeneratedText {`
Init 2022-10-08 04:30:12 -06:00			`/// Output`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`string text = 1;`
feat(server): Support generic AutoModelForCausalLM 2022-11-04 07:22:47 -06:00			`/// Number of generated tokens`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`uint32 generated_tokens = 2;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// Finish reason`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`string finish_reason = 3;`
feat: Support sampling seeding (#37) Co-authored-by: Yannic Kilcher <yk@users.noreply.github.com> 2023-01-30 07:36:16 -07:00			`/// Seed`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`optional uint64 seed = 4;`
Init 2022-10-08 04:30:12 -06:00			`}`

feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`message PrefillTokens {`
			`/// Prefill Token IDs`
			`repeated uint32 ids = 1;`
			`/// Prefill Logprobs`
			`repeated float logprobs = 2;`
			`/// Prefill tokens`
			`repeated string texts = 3;`
			`}`

			`message Generation {`
			`/// Request ID`
			`uint64 request_id = 1;`
			`/// Prefill tokens (optional)`
			`PrefillTokens prefill_tokens = 2;`
			`/// Token ID`
			`uint32 token_id = 3;`
			`/// Logprob`
			`float token_logprob = 4;`
			`/// Text`
			`string token_text = 5;`
			`/// Complete generated text`
			`GeneratedText generated_text = 6;`
			`}`

			`message PrefillRequest {`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Batch`
			`Batch batch = 1;`
Init 2022-10-08 04:30:12 -06:00			`}`

feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`message PrefillResponse {`
			`/// Generation`
			`repeated Generation generations = 1;`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Next batch (cached)`
			`optional Batch batch = 2;`
Init 2022-10-08 04:30:12 -06:00			`}`

feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`message DecodeRequest {`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Cached batches`
			`repeated Batch batches = 1;`
			`}`
Init 2022-10-08 04:30:12 -06:00
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`message DecodeResponse {`
			`/// Decodes`
			`repeated Generation generations = 1;`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Next batch (cached)`
			`optional Batch batch = 2;`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`}`