syntax = "proto3";

package generate.v1;

/// RPCs for service discovery, batch-cache management and batched token
/// generation.
service TextGenerationService {
  /// Service discovery
  rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
  /// Empties batch cache
  rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
  /// Generate tokens for a batch
  rpc Generate (GenerateRequest) returns (GenerateResponse);
  /// Generate tokens for a list of cached batches
  rpc GenerateWithCache (GenerateWithCacheRequest) returns (GenerateWithCacheResponse);
}

/// Request of the ServiceDiscovery RPC (intentionally empty)
message ServiceDiscoveryRequest {}

/// Response of the ServiceDiscovery RPC
message ServiceDiscoveryResponse {
  /// URLs of the other shards
  repeated string urls = 1;
}

/// Request of the ClearCache RPC (intentionally empty)
message ClearCacheRequest {}

/// Response of the ClearCache RPC (intentionally empty)
message ClearCacheResponse {}

/// Parameters controlling how the next token is chosen for a request
message NextTokenChooserParameters {
  /// Exponential scaling of the output probability distribution
  float temperature = 1;
  /// Restrict to the k highest-probability elements
  uint32 top_k = 2;
  /// Restrict to the top tokens whose probabilities sum to at most this
  /// cut-off
  float top_p = 3;
  /// Apply sampling on the logits
  bool do_sample = 4;
}

/// Parameters controlling when generation stops for a request
message StoppingCriteriaParameters {
  /// Maximum number of generated tokens
  uint32 max_new_tokens = 1;
  /// Optional stopping sequences
  repeated string stop_sequences = 2;
}

/// A single generation request
message Request {
  /// Request ID
  uint64 id = 1;
  /// The generation context
  string inputs = 2;
  /// The number of tokens inside inputs
  uint32 input_length = 3;
  /// Next Token Chooser Parameters
  NextTokenChooserParameters parameters = 4;
  /// Stopping Criteria Parameters
  StoppingCriteriaParameters stopping_parameters = 5;
}

/// A group of requests that are processed together
message Batch {
  /// Batch ID
  uint64 id = 1;
  /// Individual requests
  repeated Request requests = 2;
  /// Batch size (== len(requests))
  uint32 size = 3;
}

/// Generation result for one finished request
message GeneratedText {
  /// Request this output belongs to
  Request request = 1;
  /// Output text
  string output_text = 2;
  /// Number of generated tokens
  uint32 generated_tokens = 3;
  /// Tokens
  repeated string tokens = 4;
  /// Token IDs
  /// NOTE(review): presumably index-aligned with `tokens` - confirm
  repeated uint32 token_ids = 5;
  /// Logprobs
  /// NOTE(review): presumably index-aligned with `tokens` - confirm
  repeated float logprobs = 6;
  /// Finish reason
  string finish_reason = 7;
}

/// Request of the Generate RPC
message GenerateRequest {
  /// Batch to generate tokens for
  Batch batch = 1;
}

/// Response of the Generate RPC
message GenerateResponse {
  /// Finished requests
  repeated GeneratedText generated_texts = 1;
  /// Next batch (cached)
  /// NOTE(review): presumably unset once no request is left to continue -
  /// confirm against the server implementation
  optional Batch batch = 2;
}

/// Request of the GenerateWithCache RPC
message GenerateWithCacheRequest {
  /// Cached batches
  repeated Batch batches = 1;
}

/// Response of the GenerateWithCache RPC
message GenerateWithCacheResponse {
  /// Finished requests
  repeated GeneratedText generated_texts = 1;
  /// Next batch (cached)
  /// NOTE(review): presumably unset once no request is left to continue -
  /// confirm against the server implementation
  optional Batch batch = 2;
}