Disable prefix caching for LoRA.

This commit is contained in:
Nicolas Patry 2024-08-20 09:14:57 +02:00
parent cba59aca03
commit a6cd5fef23
No known key found for this signature in database
GPG Key ID: 64AF4752B2967863
1 changed file with 4 additions and 0 deletions

View File

@@ -1500,6 +1500,10 @@ fn main() -> Result<(), LauncherError> {
match config.head_dim {
Some(h) if h == 64 || h == 128 || h == 256 => {
// std::env::set_var("ATTENTION", "flashdecoding");
if args.lora_adapters.is_some() {
tracing::info!("Disabling prefix caching because of lora adapters");
std::env::set_var("USE_PREFIX_CACHING", "0");
}
}
_ => {
tracing::info!("Forcing flash decoding because head dim is not supported by flashinfer, also disabling prefix caching");