Allowing window_size_left (dummy version).

This commit is contained in:
Nicolas Patry 2024-08-17 12:04:21 +02:00
parent f2bdc65098
commit f55278de2d
No known key found for this signature in database
GPG Key ID: 64AF4752B2967863
2 changed files with 11 additions and 4 deletions

View File

@ -16,6 +16,11 @@ pub struct RadixAllocator {
/// Blocks that are immediately available for allocation.
free_blocks: Vec<u32>,
#[allow(dead_code)]
// This isn't used because the prefix needs to match without the windowing
// mechanism. At worst this overallocates; it is not necessarily wrong.
window_size: Option<u32>,
}
impl RadixAllocator {
@ -25,9 +30,9 @@ impl RadixAllocator {
"Radix tree allocator only works with block_size=1, was: {}",
block_size
);
if window_size.is_some() {
unimplemented!("Window size not supported in the prefix-caching block allocator yet");
}
// if window_size.is_some() {
// unimplemented!("Window size not supported in the prefix-caching block allocator yet");
// }
RadixAllocator {
allocation_id: 0,
@ -36,6 +41,7 @@ impl RadixAllocator {
// Block 0 is reserved for health checks.
free_blocks: (1..n_blocks).collect(),
window_size,
}
}

View File

@ -233,7 +233,7 @@ if ATTENTION == "flashinfer":
causal=True,
softcap=0.0,
):
assert window_size_left == -1, "Windowing is not supported with flash infer"
# assert window_size_left == -1, "Windowing is not supported with flash infer"
from text_generation_server.layers.attention.flashinfer import (
prefill_with_paged_kv_state,
)
@ -244,6 +244,7 @@ if ATTENTION == "flashinfer":
paged_kv_cache=(key_cache, value_cache),
logits_soft_cap=softcap,
sm_scale=softmax_scale,
window_left=window_size_left,
)
elif V2: