Allowing window_left_size (dummy version).
This commit is contained in:
parent
f2bdc65098
commit
f55278de2d
|
@ -16,6 +16,11 @@ pub struct RadixAllocator {
|
||||||
|
|
||||||
/// Blocks that are immediately available for allocation.
|
/// Blocks that are immediately available for allocation.
|
||||||
free_blocks: Vec<u32>,
|
free_blocks: Vec<u32>,
|
||||||
|
|
||||||
|
#[allow(dead_code)]
|
||||||
|
// This isn't used because the prefix needs to match without the windowing
|
||||||
|
// mechanism. This at worst is overallocating, not necessarily being wrong.
|
||||||
|
window_size: Option<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RadixAllocator {
|
impl RadixAllocator {
|
||||||
|
@ -25,9 +30,9 @@ impl RadixAllocator {
|
||||||
"Radix tree allocator only works with block_size=1, was: {}",
|
"Radix tree allocator only works with block_size=1, was: {}",
|
||||||
block_size
|
block_size
|
||||||
);
|
);
|
||||||
if window_size.is_some() {
|
// if window_size.is_some() {
|
||||||
unimplemented!("Window size not supported in the prefix-caching block allocator yet");
|
// unimplemented!("Window size not supported in the prefix-caching block allocator yet");
|
||||||
}
|
// }
|
||||||
|
|
||||||
RadixAllocator {
|
RadixAllocator {
|
||||||
allocation_id: 0,
|
allocation_id: 0,
|
||||||
|
@ -36,6 +41,7 @@ impl RadixAllocator {
|
||||||
|
|
||||||
// Block 0 is reserved for health checks.
|
// Block 0 is reserved for health checks.
|
||||||
free_blocks: (1..n_blocks).collect(),
|
free_blocks: (1..n_blocks).collect(),
|
||||||
|
window_size,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -233,7 +233,7 @@ if ATTENTION == "flashinfer":
|
||||||
causal=True,
|
causal=True,
|
||||||
softcap=0.0,
|
softcap=0.0,
|
||||||
):
|
):
|
||||||
assert window_size_left == -1, "Windowing is not supported with flash infer"
|
# assert window_size_left == -1, "Windowing is not supported with flash infer"
|
||||||
from text_generation_server.layers.attention.flashinfer import (
|
from text_generation_server.layers.attention.flashinfer import (
|
||||||
prefill_with_paged_kv_state,
|
prefill_with_paged_kv_state,
|
||||||
)
|
)
|
||||||
|
@ -244,6 +244,7 @@ if ATTENTION == "flashinfer":
|
||||||
paged_kv_cache=(key_cache, value_cache),
|
paged_kv_cache=(key_cache, value_cache),
|
||||||
logits_soft_cap=softcap,
|
logits_soft_cap=softcap,
|
||||||
sm_scale=softmax_scale,
|
sm_scale=softmax_scale,
|
||||||
|
window_left=window_size_left,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif V2:
|
elif V2:
|
||||||
|
|
Loading…
Reference in New Issue