preemo_text-generation-infe.../proto/generate.proto

119 lines
3.0 KiB
Protocol Buffer
Raw Normal View History

2022-10-08 04:30:12 -06:00
syntax = "proto3";
package generate.v1;
service TextGenerationService {
2022-10-08 04:30:12 -06:00
/// Service discovery
rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse) {}
2022-10-08 04:30:12 -06:00
/// Empties batch cache
rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
/// Generate tokens for a batch
rpc Generate (GenerateRequest) returns (GenerateResponse);
/// Generate tokens for a list of cached batches
rpc GenerateWithCache (GenerateWithCacheRequest) returns (GenerateWithCacheResponse);
/// Generate tokens until the text of at least one request of the batch is generated
rpc GenerateUntilFinished (GenerateUntilFinishedRequest) returns (GenerateUntilFinishedResponse);
/// Generate tokens until the text of at least one request of the cached batches i finished
rpc GenerateUntilFinishedWithCache (GenerateUntilFinishedWithCacheRequest) returns (GenerateUntilFinishedWithCacheResponse);
2022-10-08 04:30:12 -06:00
}
/// Empty request
message ServiceDiscoveryRequest {}
2022-10-08 04:30:12 -06:00
message ServiceDiscoveryResponse {
/// Other shards urls
2022-10-08 04:30:12 -06:00
repeated string urls = 1;
}
/// Empty request
message ClearCacheRequest {}
/// Empty response
message ClearCacheResponse {}
2022-10-08 04:30:12 -06:00
message LogitsWarperParameters {
float temperature = 1;
uint32 top_k = 2;
float top_p = 3;
bool do_sample = 4;
}
message Request {
/// Request ID
uint64 id = 1;
/// The generation context
string inputs = 2;
/// The number of tokens inside inputs
uint32 input_length = 3;
2022-10-08 04:30:12 -06:00
/// Logits Warper Parameters
LogitsWarperParameters parameters = 4;
2022-10-08 04:30:12 -06:00
/// Stopping criteria
uint32 max_new_tokens = 5;
2022-10-08 04:30:12 -06:00
}
message Batch {
/// Batch ID
uint64 id = 1;
/// Individual requests
repeated Request requests = 2;
/// Batch size (==len(requests))
uint32 size = 3;
/// Length of the longest sequence within the batch (used for padding)
uint32 max_sequence_length = 4;
2022-10-08 04:30:12 -06:00
}
message GeneratedText {
/// Request
Request request = 1;
2022-10-08 04:30:12 -06:00
/// Output
string output = 2;
}
message GenerateRequest {
/// Batch
Batch batch = 1;
2022-10-08 04:30:12 -06:00
}
message GenerateResponse {
/// Finished requests
repeated GeneratedText generated_texts = 1;
/// Next batch (cached)
optional Batch batch = 2;
2022-10-08 04:30:12 -06:00
}
message GenerateWithCacheRequest {
/// Cached batches
repeated Batch batches = 1;
}
2022-10-08 04:30:12 -06:00
message GenerateWithCacheResponse {
/// Finished requests
repeated GeneratedText generated_texts = 1;
/// Next batch (cached)
optional Batch batch = 2;
}
message GenerateUntilFinishedRequest {
/// Batch
Batch batch = 1;
}
message GenerateUntilFinishedResponse {
/// Finished requests
repeated GeneratedText generated_texts = 1;
/// Next batch (cached)
optional Batch batch = 2;
}
message GenerateUntilFinishedWithCacheRequest {
/// Cached batches
repeated Batch batches = 1;
}
message GenerateUntilFinishedWithCacheResponse {
/// Finished requests
repeated GeneratedText generated_texts = 1;
/// Next batch (cached)
optional Batch batch = 2;
}