Fix cache block size for flash decoding (#2351)
* Fix cache block size for flash decoding

  This seems to have been accidentally dropped during the TRT-LLM PR rebase.

* Also run CI on changes to `backends`
This commit is contained in:
parent
9ab9937414
commit
22fb1be588
|
@ -10,6 +10,7 @@ on:
|
|||
paths:
|
||||
- ".github/workflows/build.yaml"
|
||||
- "integration-tests/**"
|
||||
- "backends/**"
|
||||
- "server/**"
|
||||
- "proto/**"
|
||||
- "router/**"
|
||||
|
|
|
@ -35,9 +35,16 @@ impl BackendV3 {
|
|||
window_size: Option<u32>,
|
||||
speculate: u32,
|
||||
) -> Self {
|
||||
let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
|
||||
matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
|
||||
} else {
|
||||
false
|
||||
};
|
||||
let block_size = if flashdecoding { 256 } else { 16 };
|
||||
|
||||
let queue = Queue::new(
|
||||
requires_padding,
|
||||
16,
|
||||
block_size,
|
||||
window_size,
|
||||
speculate,
|
||||
max_batch_total_tokens,
|
||||
|
|
Loading…
Reference in New Issue