Disable prefix caching when LoRA adapters are used.
This commit is contained in:
parent
cba59aca03
commit
a6cd5fef23
|
@ -1500,6 +1500,10 @@ fn main() -> Result<(), LauncherError> {
|
||||||
match config.head_dim {
|
match config.head_dim {
|
||||||
Some(h) if h == 64 || h == 128 || h == 256 => {
|
Some(h) if h == 64 || h == 128 || h == 256 => {
|
||||||
// std::env::set_var("ATTENTION", "flashdecoding");
|
// std::env::set_var("ATTENTION", "flashdecoding");
|
||||||
|
if args.lora_adapters.is_some() {
|
||||||
|
tracing::info!("Disabling prefix caching because of lora adapters");
|
||||||
|
std::env::set_var("USE_PREFIX_CACHING", "0");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
tracing::info!("Forcing flash decoding because head dim is not supported by flashinfer, also disabling prefix caching");
|
tracing::info!("Forcing flash decoding because head dim is not supported by flashinfer, also disabling prefix caching");
|
||||||
|
|
Loading…
Reference in New Issue