Disable prefix caching when LoRA adapters are enabled.

2024-08-20 09:14:57 +02:00 · 2024-08-20 09:14:57 +02:00 · a6cd5fef23
parent cba59aca03
commit a6cd5fef23
1 changed file with 4 additions and 0 deletions
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -1500,6 +1500,10 @@ fn main() -> Result<(), LauncherError> {
            match config.head_dim {
                Some(h) if h == 64 || h == 128 || h == 256 => {
                    // std::env::set_var("ATTENTION", "flashdecoding");
                    if args.lora_adapters.is_some() {
                        tracing::info!("Disabling prefix caching because of lora adapters");
                        std::env::set_var("USE_PREFIX_CACHING", "0");
                    }
                }
                _ => {
                    tracing::info!("Forcing flash decoding because head dim is not supported by flashinfer, also disabling prefix caching");