syntax = "proto3";

package generate.v1;

/// RPC surface of a text generation shard: shard discovery, batch-cache
/// clearing, and token generation for fresh or previously cached batches.
service TextGenerationService {
  /// Service discovery
  rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
  /// Empties batch cache
  rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
  /// Generate tokens for a batch
  rpc Generate (GenerateRequest) returns (GenerateResponse);
  /// Generate tokens for a list of cached batches
  rpc GenerateWithCache (GenerateWithCacheRequest) returns (GenerateWithCacheResponse);
}

/// Empty request
message ServiceDiscoveryRequest {}

/// Response of the ServiceDiscovery RPC.
message ServiceDiscoveryResponse {
  /// Other shards urls
  repeated string urls = 1;
}

/// Empty request
message ClearCacheRequest {}

/// Empty response
message ClearCacheResponse {}

/// Per-request parameters for the logits warper.
/// NOTE(review): all four fields are plain proto3 scalars (implicit presence),
/// so a zero/false value cannot be told apart from "not set" on the wire.
message LogitsWarperParameters {
  /// NOTE(review): presumably the sampling temperature - confirm valid range
  /// with the server implementation.
  float temperature = 1;
  /// NOTE(review): presumably the top-k cutoff - confirm how 0 is interpreted.
  uint32 top_k = 2;
  /// NOTE(review): presumably the top-p (nucleus) threshold - confirm valid
  /// range with the server implementation.
  float top_p = 3;
  /// NOTE(review): presumably toggles sampling versus deterministic decoding -
  /// confirm with the server implementation.
  bool do_sample = 4;
}

/// A single generation request.
message Request {
  /// Request ID
  uint64 id = 1;
  /// The generation context
  string inputs = 2;
  /// The number of tokens inside inputs
  uint32 input_length = 3;
  /// Logits Warper Parameters
  LogitsWarperParameters parameters = 4;
  /// Stopping criteria
  /// NOTE(review): presumably an upper bound on the number of generated
  /// tokens - confirm with the server implementation.
  uint32 max_new_tokens = 5;
}

/// A group of requests that are processed together.
message Batch {
  /// Batch ID
  uint64 id = 1;
  /// Individual requests
  repeated Request requests = 2;
  /// Batch size (==len(requests))
  uint32 size = 3;
  /// Length of the longest sequence within the batch (used for padding)
  uint32 max_sequence_length = 4;
}

/// The output produced for one request.
message GeneratedText {
  /// Request
  Request request = 1;
  /// Output
  string output = 2;
}

/// Input of the Generate RPC.
message GenerateRequest {
  /// Batch
  Batch batch = 1;
}

/// Output of the Generate RPC.
message GenerateResponse {
  /// Finished requests
  repeated GeneratedText generated_texts = 1;
  /// Next batch (cached)
  /// NOTE(review): declared `optional`, so callers can check for presence;
  /// presumably unset once no request remains to be continued - confirm.
  optional Batch batch = 2;
}

/// Input of the GenerateWithCache RPC.
message GenerateWithCacheRequest {
  /// Cached batches
  repeated Batch batches = 1;
}

/// Output of the GenerateWithCache RPC.
message GenerateWithCacheResponse {
  /// Finished requests
  repeated GeneratedText generated_texts = 1;
  /// Next batch (cached)
  /// NOTE(review): declared `optional`, so callers can check for presence;
  /// presumably unset once no request remains to be continued - confirm.
  optional Batch batch = 2;
}