Fix cache block size for flash decoding (#2351)

* Fix cache block size for flash decoding

This seems to have been accidentally dropped during the TRT-LLM
PR rebase.

* Also run CI on changes to `backends`
This commit is contained in:
Daniël de Kok 2024-08-01 15:38:57 +02:00 committed by GitHub
parent 9ab9937414
commit 22fb1be588
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 9 additions and 1 deletion

View File

@ -10,6 +10,7 @@ on:
paths:
- ".github/workflows/build.yaml"
- "integration-tests/**"
- "backends/**"
- "server/**"
- "proto/**"
- "router/**"

View File

@ -35,9 +35,16 @@ impl BackendV3 {
window_size: Option<u32>,
speculate: u32,
) -> Self {
let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
} else {
false
};
let block_size = if flashdecoding { 256 } else { 16 };
let queue = Queue::new(
requires_padding,
16,
block_size,
window_size,
speculate,
max_batch_total_tokens,