From 3ece76392b460abe8bec736c499a2729d7e8dc2a Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 21 Aug 2024 09:03:28 +0200
Subject: [PATCH] Apply suggestions from code review

Co-authored-by: drbh
---
 launcher/src/main.rs                                    | 1 -
 server/text_generation_server/layers/attention/cuda.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/launcher/src/main.rs b/launcher/src/main.rs
index e16fa09d..627dbd14 100644
--- a/launcher/src/main.rs
+++ b/launcher/src/main.rs
@@ -1498,7 +1498,6 @@ fn main() -> Result<(), LauncherError> {
         let config: Config = config.into();
         match config.head_dim {
             Some(h) if h == 64 || h == 128 || h == 256 => {
-                // std::env::set_var("ATTENTION", "flashdecoding");
                 if args.lora_adapters.is_some() {
                     tracing::info!("Disabling prefix caching because of lora adapters");
                     std::env::set_var("USE_PREFIX_CACHING", "0");
diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py
index 7c415804..40d71e2d 100644
--- a/server/text_generation_server/layers/attention/cuda.py
+++ b/server/text_generation_server/layers/attention/cuda.py
@@ -233,7 +233,6 @@ if ATTENTION == "flashinfer":
         causal=True,
         softcap=0.0,
     ):
-        # assert window_size_left == -1, "Windowing is not supported with flash infer"
         from text_generation_server.layers.attention.flashinfer import (
             prefill_with_paged_kv_state,
         )