Fix cache block size for flash decoding (#2351)
* Fix cache block size for flash decoding

  This seems to have been accidentally dropped during the TRT-LLM PR rebase.

* Also run CI on changes to `backends`
This commit is contained in:
parent
9ab9937414
commit
22fb1be588
|
@ -10,6 +10,7 @@ on:
|
||||||
paths:
|
paths:
|
||||||
- ".github/workflows/build.yaml"
|
- ".github/workflows/build.yaml"
|
||||||
- "integration-tests/**"
|
- "integration-tests/**"
|
||||||
|
- "backends/**"
|
||||||
- "server/**"
|
- "server/**"
|
||||||
- "proto/**"
|
- "proto/**"
|
||||||
- "router/**"
|
- "router/**"
|
||||||
|
|
|
@ -35,9 +35,16 @@ impl BackendV3 {
|
||||||
window_size: Option<u32>,
|
window_size: Option<u32>,
|
||||||
speculate: u32,
|
speculate: u32,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
|
let flashdecoding = if let Ok(flashdecoding) = std::env::var("FLASH_DECODING") {
|
||||||
|
matches!(flashdecoding.to_lowercase().as_str(), "1" | "true")
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
};
|
||||||
|
let block_size = if flashdecoding { 256 } else { 16 };
|
||||||
|
|
||||||
let queue = Queue::new(
|
let queue = Queue::new(
|
||||||
requires_padding,
|
requires_padding,
|
||||||
16,
|
block_size,
|
||||||
window_size,
|
window_size,
|
||||||
speculate,
|
speculate,
|
||||||
max_batch_total_tokens,
|
max_batch_total_tokens,
|
||||||
|
|
Loading…
Reference in New Issue