// hf_text-generation-inference/proto/generate.proto

syntax = "proto3";

package generate.v2;

/// gRPC service exposing model info, shard discovery, batch cache
/// management, warmup, prefill/decode and health checking.
/// Every RPC is unary and has its own request and response message.
service TextGenerationService {
    /// Model Info
    rpc Info (InfoRequest) returns (InfoResponse);
    /// Service discovery
    rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
    /// Empties batch cache
    rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
    /// Remove requests from a cached batch
    rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
    /// Warmup the model and compute max cache size
    rpc Warmup (WarmupRequest) returns (WarmupResponse);
    /// Prefill batch and decode first token
    rpc Prefill (PrefillRequest) returns (PrefillResponse);
    /// Decode token for a list of prefilled batches
    rpc Decode (DecodeRequest) returns (DecodeResponse);
    /// Health check
    rpc Health (HealthRequest) returns (HealthResponse);
}

/// Empty request for the Health RPC
message HealthRequest {}
/// Empty response for the Health RPC
message HealthResponse {}

/// Empty request for the Info RPC
message InfoRequest {}

/// Response of the Info RPC ("Model Info")
message InfoResponse {
    /// NOTE(review): presumably tells the caller whether batches must be
    /// padded for this model; confirm against the server implementation
    bool requires_padding = 1;
    /// Data type name as a string (presumably the model dtype; the set of
    /// possible values is not defined in this file)
    string dtype = 2;
    /// Device type name as a string (the set of possible values is not
    /// defined in this file)
    string device_type = 3;
    /// Optional window size; `optional` gives explicit presence, so "unset"
    /// can be told apart from 0.
    /// NOTE(review): presumably an attention sliding-window length; confirm
    optional uint32 window_size = 4;
    /// NOTE(review): presumably the number of speculative tokens; confirm
    uint32 speculate = 5;
}

/// Empty request for the ServiceDiscovery RPC
message ServiceDiscoveryRequest {}

/// Response of the ServiceDiscovery RPC
message ServiceDiscoveryResponse {
    /// URLs of the other shards
    repeated string urls = 1;
}

/// Request for the ClearCache RPC
message ClearCacheRequest {
    /// Optional batch id (explicit presence: unset is distinct from id 0).
    /// NOTE(review): presumably only this batch is cleared when set and the
    /// whole cache otherwise; confirm against the server implementation
    optional uint64 id = 1;
}

/// Empty response for the ClearCache RPC
message ClearCacheResponse {}

/// Parameters controlling how the next token is chosen for a request
message NextTokenChooserParameters {
    /// exponential scaling of the output probability distribution
    float temperature = 1;
    /// restricting to the k highest probability elements
    uint32 top_k = 2;
    /// top-p cut-off: restricting to the top tokens whose probabilities
    /// sum up to this value
    float top_p = 3;
    /// probability cut-off used for typical sampling.
    /// NOTE(review): exact semantics are defined by the sampler
    /// implementation, not by this file; confirm
    float typical_p = 4;
    /// apply sampling on the logits
    bool do_sample = 5;
    /// random seed for sampling
    uint64 seed = 6;
    /// repetition penalty
    float repetition_penalty = 7;
    /// token watermarking using "A Watermark for Large Language Models"
    bool watermark = 8;
}

/// Criteria that end generation for a request
message StoppingCriteriaParameters {
    /// Maximum number of generated tokens
    uint32 max_new_tokens = 1;
    /// Optional stopping sequences (an empty list means none were given)
    repeated string stop_sequences = 2;
    /// Ignore end of sequence token
    /// used for benchmarking
    bool ignore_eos_token = 3;
}

/// A single generation request, carried inside a Batch
message Request {
    /// Request ID
    uint64 id = 1;
    /// The generation context
    string inputs = 2;
    /// Context truncation.
    /// NOTE(review): presumably the number of input tokens to keep; confirm
    uint32 truncate = 3;
    /// Next Token Chooser Parameters
    NextTokenChooserParameters parameters = 4;
    /// Stopping Criteria Parameters
    StoppingCriteriaParameters stopping_parameters = 5;
    /// Return prefill logprobs
    bool prefill_logprobs = 6;
    /// Return most likely n tokens
    uint32 top_n_tokens = 7;
}

/// A batch of full requests, as sent to the Prefill and Warmup RPCs
message Batch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests
    repeated Request requests = 2;
    /// Batch size (==len(requests))
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
}

/// Reference to a cached batch: carries request ids instead of the full
/// Request messages held by Batch
message CachedBatch {
    /// Batch ID
    uint64 id = 1;
    /// Individual requests ids
    repeated uint64 request_ids = 2;
    /// Batch size (==len(request_ids))
    uint32 size = 3;
    /// Maximum number of tokens this batch will grow to
    uint32 max_tokens = 4;
}

/// Reason why generation finished for a request.
/// NOTE(review): the zero value carries a real meaning here, so a
/// finish_reason that was never set reads back as FINISH_REASON_LENGTH.
/// Changing the numbering would break the wire contract.
enum FinishReason {
    /// NOTE(review): presumably the max_new_tokens limit was reached; confirm
    FINISH_REASON_LENGTH = 0;
    /// NOTE(review): presumably an end-of-sequence token was generated; confirm
    FINISH_REASON_EOS_TOKEN = 1;
    /// NOTE(review): presumably one of the stop_sequences was matched; confirm
    FINISH_REASON_STOP_SEQUENCE = 2;
}

/// Final generated text for a request, with summary information
message GeneratedText {
    /// Output
    string text = 1;
    /// Number of generated tokens
    uint32 generated_tokens = 2;
    /// Finish reason
    FinishReason finish_reason = 3;
    /// Seed (optional: explicit presence, so unset is distinct from seed 0)
    optional uint64 seed = 4;
}

/// A list of tokens stored as parallel repeated fields.
/// NOTE(review): presumably all four lists are index-aligned and have the
/// same length; nothing in this schema enforces that, so confirm with the
/// producers and consumers.
message Tokens {
    /// Token IDs
    repeated uint32 ids = 1;
    /// Logprobs
    repeated float logprobs = 2;
    /// Token texts
    repeated string texts = 3;
    /// Special-token flags
    repeated bool is_special = 4;
}

/// Generation output for one request, returned by Prefill and Decode
message Generation {
    /// Request ID
    uint64 request_id = 1;
    /// Prefill tokens (optional)
    Tokens prefill_tokens = 2;
    /// Generated tokens.
    /// NOTE(review): presumably the tokens produced by this call for the
    /// request; confirm
    Tokens tokens = 3;
    /// Complete generated text (optional).
    /// NOTE(review): presumably only set once the request has finished; confirm
    optional GeneratedText generated_text = 4;
    /// Top tokens
    repeated Tokens top_tokens = 5;
}

/// Request for the FilterBatch RPC (remove requests from a cached batch)
message FilterBatchRequest {
    /// Batch ID
    uint64 batch_id = 1;
    /// Requests to keep
    repeated uint64 request_ids = 2;
}

/// Response of the FilterBatch RPC
message FilterBatchResponse {
    /// Filtered Batch (cached)
    CachedBatch batch = 1;
}


/// Request for the Prefill RPC
message PrefillRequest {
    /// Batch
    Batch batch = 1;
}

/// Response of the Prefill RPC
message PrefillResponse {
    /// Generation
    repeated Generation generations = 1;
    /// Next batch (cached); optional.
    /// NOTE(review): presumably absent when no request remains; confirm
    optional CachedBatch batch = 2;
    /// Forward elapsed time in nanoseconds
    uint64 forward_ns = 3;
    /// Decode elapsed time in nanoseconds
    uint64 decode_ns = 4;
    /// Total elapsed time in nanoseconds
    uint64 total_ns = 5;
}

/// Request for the Decode RPC
message DecodeRequest {
    /// Cached batches
    repeated CachedBatch batches = 1;
}

/// Response of the Decode RPC
message DecodeResponse {
    /// Decodes
    repeated Generation generations = 1;
    /// Next batch (cached); optional.
    /// NOTE(review): presumably absent when no request remains; confirm
    optional CachedBatch batch = 2;
    /// Forward elapsed time in nanoseconds
    uint64 forward_ns = 3;
    /// Decode elapsed time in nanoseconds
    uint64 decode_ns = 4;
    /// Total elapsed time in nanoseconds
    uint64 total_ns = 5;
    /// Concatenate elapsed time in nanoseconds (optional).
    /// NOTE(review): presumably only set when batches were concatenated; confirm
    optional uint64 concat_ns = 6;
}

/// Request for the Warmup RPC
message WarmupRequest {
    /// Batch to warmup on
    Batch batch = 1;
    /// NOTE(review): presumably the maximum input length, in tokens; confirm
    uint32 max_input_length = 2;
    /// NOTE(review): presumably the maximum number of prefill tokens; confirm
    uint32 max_prefill_tokens = 3;
    /// NOTE(review): presumably the maximum number of total tokens (input
    /// plus generated); confirm
    uint32 max_total_tokens = 4;
}

/// Response of the Warmup RPC
message WarmupResponse {
    /// Maximum number of tokens supported by the model (optional: explicit
    /// presence, so unset is distinct from 0)
    optional uint32 max_supported_total_tokens = 1;
}
Init 2022-10-08 04:30:12 -06:00			`syntax = "proto3";`

Speculative (#1308) 2023-12-11 04:46:30 -07:00			`package generate.v2;`
Init 2022-10-08 04:30:12 -06:00
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`service TextGenerationService {`
feat(router): add device and dtype info (#215) 2023-04-21 07:36:29 -06:00			`/// Model Info`
			`rpc Info (InfoRequest) returns (InfoResponse) {}`
Init 2022-10-08 04:30:12 -06:00			`/// Service discovery`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse) {}`
Init 2022-10-08 04:30:12 -06:00			`/// Empties batch cache`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);`
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 09:59:00 -06:00			`/// Remove requests from a cached batch`
			`rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);`
feat(server): add paged attention to flash models (#516) Closes #478 2023-06-30 11:09:59 -06:00			`/// Warmup the model and compute max cache size`
			`rpc Warmup (WarmupRequest) returns (WarmupResponse);`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`/// Prefill batch and decode first token`
			`rpc Prefill (PrefillRequest) returns (PrefillResponse);`
			`/// Decode token for a list of prefilled batches`
			`rpc Decode (DecodeRequest) returns (DecodeResponse);`
feat(router): new healthcheck that skips the queue (#244) Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Co-authored-by: OlivierDehaene <olivier@huggingface.co> 2023-04-26 12:23:54 -06:00			`/// Health check`
			`rpc Health (HealthRequest) returns (HealthResponse);`
Init 2022-10-08 04:30:12 -06:00			`}`

feat(router): new healthcheck that skips the queue (#244) Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Co-authored-by: OlivierDehaene <olivier@huggingface.co> 2023-04-26 12:23:54 -06:00			`message HealthRequest {}`
			`message HealthResponse {}`

feat(router): add device and dtype info (#215) 2023-04-21 07:36:29 -06:00			`/// Empty request`
			`message InfoRequest {}`

			`message InfoResponse {`
			`bool requires_padding = 1;`
			`string dtype = 2;`
			`string device_type = 3;`
feat: add mistral model (#1071) 2023-09-28 01:55:47 -06:00			`optional uint32 window_size = 4;`
Speculative (#1308) 2023-12-11 04:46:30 -07:00			`uint32 speculate = 5;`
feat(router): add device and dtype info (#215) 2023-04-21 07:36:29 -06:00			`}`

Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Empty request`
			`message ServiceDiscoveryRequest {}`

Init 2022-10-08 04:30:12 -06:00			`message ServiceDiscoveryResponse {`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Other shards urls`
Init 2022-10-08 04:30:12 -06:00			`repeated string urls = 1;`
			`}`

feat(server): clear cache on error (#143) 2023-03-28 03:29:35 -06:00			`message ClearCacheRequest {`
			`/// Optional batch id`
			`optional uint64 id = 1;`
			`}`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00
			`/// Empty response`
			`message ClearCacheResponse {}`

feat: Return logprobs (#8) 2022-12-15 09:03:56 -07:00			`message NextTokenChooserParameters {`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// exponential scaling output probability distribution`
Init 2022-10-08 04:30:12 -06:00			`float temperature = 1;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// restricting to the k highest probability elements`
Init 2022-10-08 04:30:12 -06:00			`uint32 top_k = 2;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// restricting to top tokens summing to prob_cut_off <= prob_cut_off`
Init 2022-10-08 04:30:12 -06:00			`float top_p = 3;`
feat: support typical sampling (#114) closes #112 2023-03-09 03:33:57 -07:00			`/// restricting to top tokens summing to prob_cut_off <= prob_cut_off`
			`float typical_p = 4;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// apply sampling on the logits`
feat: support typical sampling (#114) closes #112 2023-03-09 03:33:57 -07:00			`bool do_sample = 5;`
feat: Support sampling seeding (#37) Co-authored-by: Yannic Kilcher <yk@users.noreply.github.com> 2023-01-30 07:36:16 -07:00			`/// random seed for sampling`
feat: support typical sampling (#114) closes #112 2023-03-09 03:33:57 -07:00			`uint64 seed = 6;`
feat(server): support repetition penalty (#47) 2023-02-01 07:58:42 -07:00			`/// repetition penalty`
feat: support typical sampling (#114) closes #112 2023-03-09 03:33:57 -07:00			`float repetition_penalty = 7;`
feat(server): add logits watermark (#90) 2023-03-02 04:30:41 -07:00			`/// token watermarking using "A Watermark for Large Language Models"`
feat: support typical sampling (#114) closes #112 2023-03-09 03:33:57 -07:00			`bool watermark = 8;`
Init 2022-10-08 04:30:12 -06:00			`}`

feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`message StoppingCriteriaParameters {`
			`/// Maximum number of generated tokens`
			`uint32 max_new_tokens = 1;`
			`/// Optional stopping sequences`
			`repeated string stop_sequences = 2;`
feat(benchmark): tui based benchmarking tool (#149) 2023-03-30 07:26:27 -06:00			`/// Ignore end of sequence token`
			`/// used for benchmarking`
			`bool ignore_eos_token = 3;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`}`

Init 2022-10-08 04:30:12 -06:00			`message Request {`
			`/// Request ID`
			`uint64 id = 1;`
			`/// The generation context`
			`string inputs = 2;`
feat(router): make router input validation optional (#164) 2023-04-09 12:22:27 -06:00			`/// Context truncation`
			`uint32 truncate = 3;`
feat: Return logprobs (#8) 2022-12-15 09:03:56 -07:00			`/// Next Token Chooser Parameters`
feat(router): make router input validation optional (#164) 2023-04-09 12:22:27 -06:00			`NextTokenChooserParameters parameters = 4;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// Stopping Criteria Parameters`
feat(router): make router input validation optional (#164) 2023-04-09 12:22:27 -06:00			`StoppingCriteriaParameters stopping_parameters = 5;`
feat(server): only compute prefill logprobs when asked (#406) Close #288 2023-06-02 09:12:30 -06:00			`/// Return prefill logprobs`
			`bool prefill_logprobs = 6;`
Rebased #617 (#868) # What does this PR do? <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> --------- Co-authored-by: Vincent Brouwers <vincent.brouwers@ing.com> 2023-08-28 03:43:47 -06:00			`/// Return most likely n tokens`
			`uint32 top_n_tokens = 7;`
Init 2022-10-08 04:30:12 -06:00			`}`

			`message Batch {`
			`/// Batch ID`
			`uint64 id = 1;`
			`/// Individual requests`
			`repeated Request requests = 2;`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Batch size (==len(requests))`
			`uint32 size = 3;`
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 09:59:00 -06:00			`/// Maximum number of tokens this batch will grow to`
			`uint32 max_tokens = 4;`
Init 2022-10-08 04:30:12 -06:00			`}`

feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 11:19:57 -06:00			`message CachedBatch {`
			`/// Batch ID`
			`uint64 id = 1;`
			`/// Individual requests ids`
			`repeated uint64 request_ids = 2;`
			`/// Batch size (==len(requests))`
			`uint32 size = 3;`
			`/// Maximum number of tokens this batch will grow to`
			`uint32 max_tokens = 4;`
			`}`

feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`enum FinishReason {`
			`FINISH_REASON_LENGTH = 0;`
			`FINISH_REASON_EOS_TOKEN = 1;`
			`FINISH_REASON_STOP_SEQUENCE = 2;`
			`}`

Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`message GeneratedText {`
Init 2022-10-08 04:30:12 -06:00			`/// Output`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`string text = 1;`
feat(server): Support generic AutoModelForCausalLM 2022-11-04 07:22:47 -06:00			`/// Number of generated tokens`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`uint32 generated_tokens = 2;`
feat: Support stop sequences (#7) 2022-12-12 10:25:22 -07:00			`/// Finish reason`
feat(router): refactor API and add openAPI schemas (#53) 2023-02-03 04:43:37 -07:00			`FinishReason finish_reason = 3;`
feat: Support sampling seeding (#37) Co-authored-by: Yannic Kilcher <yk@users.noreply.github.com> 2023-01-30 07:36:16 -07:00			`/// Seed`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`optional uint64 seed = 4;`
Init 2022-10-08 04:30:12 -06:00			`}`

Speculative (#1308) 2023-12-11 04:46:30 -07:00			`message Tokens {`
			`/// Token IDs`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`repeated uint32 ids = 1;`
Speculative (#1308) 2023-12-11 04:46:30 -07:00			`/// Logprobs`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`repeated float logprobs = 2;`
Speculative (#1308) 2023-12-11 04:46:30 -07:00			`/// tokens`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`repeated string texts = 3;`
Speculative (#1308) 2023-12-11 04:46:30 -07:00			`/// special`
			`repeated bool is_special = 4;`
Rebased #617 (#868) # What does this PR do? <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> --------- Co-authored-by: Vincent Brouwers <vincent.brouwers@ing.com> 2023-08-28 03:43:47 -06:00			`}`

feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`message Generation {`
			`/// Request ID`
			`uint64 request_id = 1;`
			`/// Prefill tokens (optional)`
Speculative (#1308) 2023-12-11 04:46:30 -07:00			`Tokens prefill_tokens = 2;`
			`Tokens tokens = 3;`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`/// Complete generated text`
Speculative (#1308) 2023-12-11 04:46:30 -07:00			`optional GeneratedText generated_text = 4;`
Rebased #617 (#868) # What does this PR do? <!-- Congratulations! You've made it this far! You're not quite done yet though. Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution. Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change. Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost. --> <!-- Remove if not applicable --> Fixes # (issue) ## Before submitting - [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section? - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. - [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). - [ ] Did you write any new necessary tests? ## Who can review? Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR. 
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @ @OlivierDehaene OR @Narsil --> --------- Co-authored-by: Vincent Brouwers <vincent.brouwers@ing.com> 2023-08-28 03:43:47 -06:00			`/// Top tokens`
Speculative (#1308) 2023-12-11 04:46:30 -07:00			`repeated Tokens top_tokens = 5;`
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`}`

feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 09:59:00 -06:00			`message FilterBatchRequest {`
			`/// Batch ID`
			`uint64 batch_id = 1;`
			`/// Requests to keep`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 11:19:57 -06:00			`repeated uint64 request_ids = 2;`
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 09:59:00 -06:00			`}`

			`message FilterBatchResponse {`
			`/// Filtered Batch (cached)`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 11:19:57 -06:00			`CachedBatch batch = 1;`
feat(router): use number of tokens in batch as input for dynamic batching (#226) Co-authored-by: Nick Hill <nickhill@us.ibm.com> 2023-04-24 09:59:00 -06:00			`}`


feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`message PrefillRequest {`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Batch`
			`Batch batch = 1;`
Init 2022-10-08 04:30:12 -06:00			`}`

feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`message PrefillResponse {`
			`/// Generation`
			`repeated Generation generations = 1;`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Next batch (cached)`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 11:19:57 -06:00			`optional CachedBatch batch = 2;`
feat: add more latency metrics in forward (#1346) 2023-12-14 07:59:38 -07:00			`/// Forward elapsed time in nanoseconds`
			`uint64 forward_ns = 3;`
			`/// Decode elapsed time in nanoseconds`
			`uint64 decode_ns = 4;`
			`/// Total elapsed time in nanoseconds`
			`uint64 total_ns = 5;`
Init 2022-10-08 04:30:12 -06:00			`}`

feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`message DecodeRequest {`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Cached batches`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 11:19:57 -06:00			`repeated CachedBatch batches = 1;`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`}`
Init 2022-10-08 04:30:12 -06:00
feat: Add token streaming using ServerSideEvents support (#41) 2023-01-31 09:04:00 -07:00			`message DecodeResponse {`
			`/// Decodes`
			`repeated Generation generations = 1;`
Refactored gRPC interface Added validation logic 2022-10-11 08:50:54 -06:00			`/// Next batch (cached)`
feat: decrease IPC proto size (#367) Closes #307 #308 2023-05-24 11:19:57 -06:00			`optional CachedBatch batch = 2;`
feat: add more latency metrics in forward (#1346) 2023-12-14 07:59:38 -07:00			`/// Forward elapsed time in nanoseconds`
			`uint64 forward_ns = 3;`
			`/// Decode elapsed time in nanoseconds`
			`uint64 decode_ns = 4;`
			`/// Total elapsed time in nanoseconds`
			`uint64 total_ns = 5;`
			`/// Concatenate elapsed time in nanoseconds`
			`optional uint64 concat_ns = 6;`
feat(router): new healthcheck that skips the queue (#244) Co-authored-by: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Co-authored-by: OlivierDehaene <olivier@huggingface.co> 2023-04-26 12:23:54 -06:00			`}`
feat(server): add paged attention to flash models (#516) Closes #478 2023-06-30 11:09:59 -06:00
			`message WarmupRequest {`
			`/// Batch to warmup on`
			`Batch batch = 1;`
fix: fix gpt-q with groupsize = -1 (#1358) 2023-12-18 08:07:05 -07:00			`uint32 max_input_length = 2;`
			`uint32 max_prefill_tokens = 3;`
			`uint32 max_total_tokens = 4;`
feat(server): add paged attention to flash models (#516) Closes #478 2023-06-30 11:09:59 -06:00			`}`

			`/// Empty response`
feat(server): auto max_batch_total_tokens for flash att models (#630) 2023-07-19 01:31:25 -06:00			`message WarmupResponse {`
			`/// Maximum number of tokens supported by the model`
			`optional uint32 max_supported_total_tokens = 1;`
			`}`