From 3ece76392b460abe8bec736c499a2729d7e8dc2a Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 21 Aug 2024 09:03:28 +0200
Subject: [PATCH] Apply suggestions from code review

Co-authored-by: drbh
---
 launcher/src/main.rs                                    | 1 -
 server/text_generation_server/layers/attention/cuda.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index e16fa09d..627dbd14 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -1498,7 +1498,6 @@ fn main() -> Result<(), LauncherError> {
         let config: Config = config.into();
         match config.head_dim {
             Some(h) if h == 64 || h == 128 || h == 256 => {
-                // std::env::set_var("ATTENTION", "flashdecoding");
                 if args.lora_adapters.is_some() {
                     tracing::info!("Disabling prefix caching because of lora adapters");
                     std::env::set_var("USE_PREFIX_CACHING", "0");
diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py
index 7c415804..40d71e2d 100644
--- a/server/text_generation_server/layers/attention/cuda.py
+++ b/server/text_generation_server/layers/attention/cuda.py
@@ -233,7 +233,6 @@ if ATTENTION == "flashinfer":
         causal=True,
         softcap=0.0,
     ):
-        # assert window_size_left == -1, "Windowing is not supported with flash infer"
         from text_generation_server.layers.attention.flashinfer import (
             prefill_with_paged_kv_state,
         )