diff --git a/launcher/src/main.rs b/launcher/src/main.rs index b4fc86b7..ac1469f6 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -333,6 +333,16 @@ struct Args { #[clap(long, env)] rope_factor: Option<f32>, + /// Limit the Paged Attention context window size + /// The sliding window is only used by flash-attention-optimized models + #[clap(long, env)] + sliding_window: Option<usize>, + + /// If `sliding_window` is set, always keep the first `attention_sinks` tokens in the context + /// See: [Efficient Streaming Language Models with Attention Sinks](https://arxiv.org/abs/2309.17453) + #[clap(long, env)] + attention_sinks: Option<usize>, + /// Outputs the logs in JSON format (useful for telemetry) #[clap(long, env)] json_output: bool, @@ -390,6 +400,8 @@ fn shard_manager( cuda_memory_fraction: f32, rope_scaling: Option<RopeScaling>, rope_factor: Option<f32>, + sliding_window: Option<usize>, + attention_sinks: Option<usize>, otlp_endpoint: Option<String>, status_sender: mpsc::Sender<ShardStatus>, shutdown: Arc<AtomicBool>, @@ -495,6 +507,17 @@ fn shard_manager( envs.push(("ROPE_FACTOR".into(), factor.to_string().into())); } + // Detect sliding window + // Sent as env vars instead of CLI args to avoid bloating everything: + // only flash attention models can use them, so passing the information around + // for all models would complicate the code unnecessarily + if let Some(sliding_window) = sliding_window { + envs.push(("SLIDING_WINDOW".into(), sliding_window.to_string().into())); + } + if let Some(attention_sinks) = attention_sinks { + envs.push(("ATTENTION_SINKS".into(), attention_sinks.to_string().into())); + } + // If huggingface_hub_cache is some, pass it to the shard // Useful when running inside a docker container if let Some(huggingface_hub_cache) = huggingface_hub_cache { @@ -891,6 +914,8 @@ fn spawn_shards( let cuda_memory_fraction = args.cuda_memory_fraction; let rope_scaling = args.rope_scaling; let rope_factor = args.rope_factor; + let sliding_window = args.sliding_window; + let attention_sinks = args.attention_sinks; thread::spawn(move || { shard_manager( model_id, @@ -911,6 +936,8 @@ fn spawn_shards( cuda_memory_fraction, rope_scaling, rope_factor, + sliding_window, + attention_sinks, otlp_endpoint, status_sender, shutdown, diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 5b1b5715..4ee0cd00 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -297,7 +297,7 @@ def get_model( raise ValueError("awq quantization is not supported for AutoModel") elif (quantize == "bitsandbytes-fp4") or (quantize == "bitsandbytes-nf4"): raise ValueError("4bit quantization is not supported for AutoModel") - elif (quantize == "eetq"): + elif quantize == "eetq": raise ValueError("Eetq quantization is not supported for AutoModel") if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: return CausalLM( diff --git a/server/text_generation_server/models/bloom.py b/server/text_generation_server/models/bloom.py index 8e8daad3..c3876023 100644 --- a/server/text_generation_server/models/bloom.py +++ b/server/text_generation_server/models/bloom.py @@ -74,7 +74,11 @@ class BLOOMSharded(CausalLM): torch.distributed.barrier(group=self.process_group) filenames = weight_files(model_id, revision=revision, extension=".safetensors") weights = Weights( - filenames, device=device, dtype=dtype, process_group=self.process_group, prefix="transformer", + filenames, + device=device, + dtype=dtype, + process_group=self.process_group, + prefix="transformer",
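For intuition, the two new launcher flags combine roughly as in the attention-sinks paper: with a window of `W` tokens and `K` sinks, generation keeps the first `K` prompt tokens plus the most recent `W - K` tokens in the cache. The helper below is a made-up illustration, not part of this PR:

```python
# Illustration only: which positions stay visible at step t for a sliding
# window of `window` tokens with `sinks` attention sinks.
def visible_positions(t: int, window: int, sinks: int) -> list[int]:
    if t <= window:
        return list(range(t))
    return list(range(sinks)) + list(range(t - (window - sinks), t))


print(visible_positions(t=12, window=6, sinks=2))
# [0, 1, 8, 9, 10, 11] -> the 2 sink tokens plus the 4 most recent tokens
```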
) if config.quantize == "gptq": weights._set_gptq_params(model_id) diff --git a/server/text_generation_server/models/cache_manager.py b/server/text_generation_server/models/cache_manager.py index 2e6ae086..35549f51 100644 --- a/server/text_generation_server/models/cache_manager.py +++ b/server/text_generation_server/models/cache_manager.py @@ -15,12 +15,14 @@ class CacheManager: num_layers: int, num_heads: int, head_size: int, + attention_sinks: int, repeat_slots: bool, dtype: torch.dtype, device: torch.device, ): self.block_size = BLOCK_SIZE self.num_blocks = num_blocks + self.attention_sinks = attention_sinks self.repeat_slots = repeat_slots element_size = torch.tensor([], dtype=dtype).element_size() @@ -82,8 +84,23 @@ class CacheManager: # Repeat slots in the case of context sliding window if needed_slots > len(all_slots) and self.repeat_slots: - repeats = math.ceil(needed_slots / len(all_slots)) - all_slots = all_slots.repeat(repeats) + repeats = math.ceil( + needed_slots / (len(all_slots) - self.attention_sinks) + ) + + if self.attention_sinks > 0: + # Remove attention sinks from the repeat to not override them + all_slots = torch.cat( + [ + all_slots, + all_slots[self.attention_sinks :].repeat(repeats - 1), + ] + ) + else: + all_slots = all_slots.repeat(repeats) + + elif needed_slots > len(all_slots): + raise RuntimeError("Out of available slots. This is a bug") allocated_slots = all_slots[:needed_slots] @@ -112,6 +129,7 @@ def set_cache_manager( num_layers: int, num_heads: int, head_size: int, + attention_sinks: int, repeat_slots: bool, dtype: torch.dtype, device: torch.device, @@ -122,7 +140,14 @@ def set_cache_manager( torch.cuda.empty_cache() CACHE_MANAGER = CacheManager( - num_blocks, num_layers, num_heads, head_size, repeat_slots, dtype, device + num_blocks, + num_layers, + num_heads, + head_size, + attention_sinks, + repeat_slots, + dtype, + device, ) return CACHE_MANAGER diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index 7c743a88..ff191ee5 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -254,6 +254,7 @@ class FlashLlamaAttention(torch.nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ): qkv = self.query_key_value(hidden_states) query, kv = qkv.split( @@ -269,8 +270,13 @@ class FlashLlamaAttention(torch.nn.Module): self.rotary_emb(query, cos, sin) self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin) + if prefill_cache_indices is not None: + kv_to_cache = kv[prefill_cache_indices] + else: + kv_to_cache = kv + vllm_cache_ops.reshape_and_cache( - kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots + kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots ) # output tensor @@ -376,6 +382,7 @@ class FlashLlamaLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -390,6 +397,7 @@ class FlashLlamaLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) # faster post attention rms norm @@ -442,6 +450,7 @@ class FlashLlamaModel(torch.nn.Module): slots: torch.Tensor, input_lengths: torch.Tensor, max_s: int, + prefill_cache_indices: Optional[torch.Tensor], ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) @@ -464,6 +473,7 @@ class FlashLlamaModel(torch.nn.Module): slots, 
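As a sanity check on the slot-repetition change in `CacheManager` above, here is a small standalone sketch (toy slot numbers, not the real block layout) of how slots are reused once the window is full while the first `attention_sinks` slots are never overwritten:

```python
import math

import torch

all_slots = torch.arange(8)  # pretend the allocated blocks expose 8 slots
attention_sinks = 2
needed_slots = 19            # prompt + generated tokens exceed the window

if needed_slots > len(all_slots):
    repeats = math.ceil(needed_slots / (len(all_slots) - attention_sinks))
    if attention_sinks > 0:
        # keep the sink slots out of the repeated tail so they are never reused
        all_slots = torch.cat(
            [all_slots, all_slots[attention_sinks:].repeat(repeats - 1)]
        )
    else:
        all_slots = all_slots.repeat(repeats)

print(all_slots[:needed_slots].tolist())
# [0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6]
```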
input_lengths, max_s, + prefill_cache_indices, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -492,8 +502,19 @@ class FlashLlamaForCausalLM(torch.nn.Module): slots: torch.Tensor, input_lengths: torch.Tensor, max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + sliding_window: int, lm_head_indices: Optional[torch.Tensor] = None, ) -> torch.Tensor: + if prefill_cache_indices is not None: + # Slots also need to be sliced as it has the same size as the whole kv tensor + slots = slots[prefill_cache_indices] + elif sliding_window != -1: + # Clamp in decode mode as paged attention requires clamped values whereas the flash attention + # kernel requires the true values + max_s = min(sliding_window, max_s) + input_lengths = torch.clamp(input_lengths, max=sliding_window) + hidden_states = self.model( input_ids, position_ids, @@ -503,6 +524,7 @@ class FlashLlamaForCausalLM(torch.nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index 77b7f230..b21b730b 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ -201,9 +201,6 @@ class MistralAttention(torch.nn.Module): weights, ): super().__init__() - self.max_past = ( - config.sliding_window if config.sliding_window is not None else 0 - ) self.num_heads = config.num_attention_heads self.hidden_size = config.hidden_size self.head_size = self.hidden_size // self.num_heads @@ -252,6 +249,7 @@ class MistralAttention(torch.nn.Module): input_lengths, max_s, prefill_cache_indices, + sliding_window, ): qkv = self.query_key_value(hidden_states) query, kv = qkv.split( @@ -290,7 +288,7 @@ class MistralAttention(torch.nn.Module): cu_seqlen_prefill, max_s, self.softmax_scale, - window_size_left=self.max_past, + window_size_left=sliding_window, ) # Decode else: @@ -381,6 +379,7 @@ class MistralLayer(nn.Module): input_lengths, max_s, prefill_cache_indices, + sliding_window, ): normed_hidden_states, res = self.input_layernorm(hidden_states, residual) @@ -396,6 +395,7 @@ class MistralLayer(nn.Module): input_lengths, max_s, prefill_cache_indices, + sliding_window, ) # faster post attention rms norm @@ -449,6 +449,7 @@ class MistralModel(torch.nn.Module): input_lengths: torch.Tensor, max_s: int, prefill_cache_indices: Optional[torch.Tensor], + sliding_window: int, ) -> torch.Tensor: hidden_states = self.embed_tokens(input_ids) @@ -472,6 +473,7 @@ class MistralModel(torch.nn.Module): input_lengths, max_s, prefill_cache_indices, + sliding_window, ) hidden_states, _ = self.norm(hidden_states, residual) @@ -489,9 +491,6 @@ class FlashMistralForCausalLM(torch.nn.Module): prefix="lm_head", weights=weights, ) - self.max_past = config.sliding_window - if self.max_past is None: - raise ValueError("max_past cannot be None") def forward( self, @@ -504,16 +503,17 @@ class FlashMistralForCausalLM(torch.nn.Module): input_lengths: torch.Tensor, max_s: int, prefill_cache_indices: Optional[torch.Tensor], + sliding_window: int, lm_head_indices: Optional[torch.Tensor] = None, ) -> torch.Tensor: if prefill_cache_indices is not None: # Slots also need to be sliced as it has the same size as the whole kv tensor slots = slots[prefill_cache_indices] - else: + elif sliding_window != -1: # 
Clamp in decode mode as paged attention requires clamped values whereas the flash attention # kernel requires the true values - max_s = min(self.max_past, max_s) - input_lengths = torch.clamp(input_lengths, max=self.max_past) + max_s = min(sliding_window, max_s) + input_lengths = torch.clamp(input_lengths, max=sliding_window) hidden_states = self.model( input_ids, @@ -525,6 +525,7 @@ class FlashMistralForCausalLM(torch.nn.Module): input_lengths, max_s, prefill_cache_indices, + sliding_window, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index 9dc374df..abc66179 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -133,6 +133,7 @@ class FlashNeoxAttention(torch.nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ): qkv = self.query_key_value(hidden_states) qkv = qkv.view(-1, 3, self.num_heads, self.head_size) @@ -141,20 +142,28 @@ class FlashNeoxAttention(torch.nn.Module): self.rotary_emb(qkv[:, 0], cos, sin) self.rotary_emb(qkv[:, 1], cos, sin) + query, kv = qkv.split([1, 2], dim=1) + query = query.view(-1, self.num_heads, self.head_size) + + if prefill_cache_indices is not None: + kv_to_cache = kv[prefill_cache_indices] + else: + kv_to_cache = kv + vllm_cache_ops.reshape_and_cache( - qkv[:, 1], qkv[:, 2], kv_cache[0], kv_cache[1], slots + kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots ) # output tensor - attn_output = torch.empty_like(qkv[:, 0]) + attn_output = torch.empty_like(query) # Prefill if cu_seqlen_prefill is not None: # flash attention attention( - qkv[:, 0], - qkv[:, 1], - qkv[:, 2], + query, + kv[:, 0], + kv[:, 1], attn_output, cu_seqlen_prefill, max_s, @@ -166,7 +175,7 @@ class FlashNeoxAttention(torch.nn.Module): block_size = kv_cache[1].shape[3] vllm_attention_ops.single_query_cached_kv_attention( attn_output, - qkv[:, 0], + query, kv_cache[0], kv_cache[1], self.kv_head_mapping, @@ -245,6 +254,7 @@ class FlashNeoXLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ): if self.use_parallel_residual: ln1_hidden_states, _ = self.input_layernorm(hidden_states) @@ -259,6 +269,7 @@ class FlashNeoXLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) ln2_hidden_states, _ = self.post_attention_layernorm(hidden_states) @@ -283,6 +294,7 @@ class FlashNeoXLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) hidden_states, residual = self.post_attention_layernorm( @@ -337,6 +349,7 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): slots: torch.Tensor, input_lengths: torch.Tensor, max_s: int, + prefill_cache_indices: Optional[torch.Tensor], ) -> torch.Tensor: hidden_states = self.embed_in(input_ids) @@ -359,6 +372,7 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel): slots, input_lengths, max_s, + prefill_cache_indices, ) hidden_states, _ = self.final_layer_norm(hidden_states, residual) @@ -385,8 +399,19 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): slots: torch.Tensor, input_lengths: torch.Tensor, max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + sliding_window: int, lm_head_indices: Optional[torch.Tensor] = None, ) -> torch.Tensor: + if prefill_cache_indices is not None: + # Slots also need to be sliced as it has the same size as 
the whole kv tensor + slots = slots[prefill_cache_indices] + elif sliding_window != -1: + # Clamp in decode mode as paged attention requires clamped values whereas the flash attention + # kernel requires the true values + max_s = min(sliding_window, max_s) + input_lengths = torch.clamp(input_lengths, max=sliding_window) + hidden_states = self.gpt_neox( input_ids, position_ids, @@ -396,6 +421,7 @@ class FlashGPTNeoXForCausalLM(FlashGPTNeoXPreTrainedModel): slots, input_lengths, max_s, + prefill_cache_indices, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index 8419fa4f..89a15c5d 100644 --- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -174,6 +174,7 @@ class FlashRWAttention(torch.nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ): qkv = self.query_key_value(hidden_states) @@ -191,8 +192,13 @@ class FlashRWAttention(torch.nn.Module): self.rotary_emb(query, cos, sin) self.rotary_emb(torch.select(kv, dim=1, index=0), cos, sin) + if prefill_cache_indices is not None: + kv_to_cache = kv[prefill_cache_indices] + else: + kv_to_cache = kv + vllm_cache_ops.reshape_and_cache( - kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots + kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots ) # output @@ -294,6 +300,7 @@ class FlashRWLargeAttention(torch.nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ): qkv = self.query_key_value(hidden_states) qkv = qkv.view(-1, self.num_groups, self.num_heads + 2, self.head_size) @@ -310,9 +317,14 @@ class FlashRWLargeAttention(torch.nn.Module): self.rotary_emb(query, cos, sin) self.rotary_emb(torch.select(kv, dim=2, index=0), cos, sin) + if prefill_cache_indices is not None: + kv_to_cache = kv[prefill_cache_indices] + else: + kv_to_cache = kv + vllm_cache_ops.reshape_and_cache( - kv[:, :, 0].contiguous(), - kv[:, :, 1].contiguous(), + kv_to_cache[:, :, 0].contiguous(), + kv_to_cache[:, :, 1].contiguous(), kv_cache[0], kv_cache[1], slots, @@ -428,6 +440,7 @@ class FlashRWLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ): if self.parallel_attn: ln_hidden_states, residual = self.input_layernorm(hidden_states, residual) @@ -442,6 +455,7 @@ class FlashRWLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) mlp_output = self.mlp(ln_hidden_states) @@ -464,6 +478,7 @@ class FlashRWLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) hidden_states, residual = self.post_attention_layernorm( @@ -513,6 +528,7 @@ class FlashRWLargeLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ): ln_attn, residual = self.ln_attn(hidden_states, residual) ln_mlp, _ = self.ln_mlp(residual) @@ -528,6 +544,7 @@ class FlashRWLargeLayer(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) # MLP. 
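The same `kv_to_cache` pattern is repeated in every attention module touched above. A minimal sketch with plain tensors and toy indices (the real code writes into the paged cache through `vllm_cache_ops.reshape_and_cache`):

```python
import torch

num_tokens, kv_heads, head_size = 10, 8, 64
kv = torch.randn(num_tokens, 2, kv_heads, head_size)  # [tokens, k/v, heads, dim]

# tokens that will still be visible after the window slides: 2 sinks + last 4
prefill_cache_indices = torch.tensor([0, 1, 6, 7, 8, 9])
# toy slot numbers; in the real code these come from the CacheManager
slots = torch.tensor([3, 4, 16, 17, 18, 19])

kv_to_cache = kv[prefill_cache_indices] if prefill_cache_indices is not None else kv
assert kv_to_cache.shape[0] == slots.shape[0]  # one cache slot per kept token
```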
@@ -589,6 +606,7 @@ class FlashRWModel(FlashRWPreTrainedModel): slots: torch.Tensor, input_lengths: torch.Tensor, max_s: int, + prefill_cache_indices, ) -> torch.Tensor: hidden_states = self.word_embeddings(input_ids) @@ -611,6 +629,7 @@ class FlashRWModel(FlashRWPreTrainedModel): slots, input_lengths, max_s, + prefill_cache_indices, ) hidden_states, _ = self.ln_f(hidden_states, residual) @@ -638,8 +657,19 @@ class FlashRWForCausalLM(FlashRWPreTrainedModel): slots: torch.Tensor, input_lengths: torch.Tensor, max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + sliding_window: int, lm_head_indices: Optional[torch.Tensor] = None, ) -> torch.Tensor: + if prefill_cache_indices is not None: + # Slots also need to be sliced as it has the same size as the whole kv tensor + slots = slots[prefill_cache_indices] + elif sliding_window != -1: + # Clamp in decode mode as paged attention requires clamped values whereas the flash attention + # kernel requires the true values + max_s = min(sliding_window, max_s) + input_lengths = torch.clamp(input_lengths, max=sliding_window) + hidden_states = self.transformer( input_ids, position_ids, @@ -649,6 +679,7 @@ class FlashRWForCausalLM(FlashRWPreTrainedModel): slots, input_lengths, max_s, + prefill_cache_indices, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 2dd0a5ee..3a61d058 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -246,6 +246,7 @@ class FlashMQAttention(torch.nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ): qkv = self.c_attn(hidden_states) @@ -258,8 +259,13 @@ class FlashMQAttention(torch.nn.Module): query = query.view(-1, self.num_heads, self.head_size) key_value = key_value.view(-1, 2, 1, self.head_size) + if prefill_cache_indices is not None: + kv_to_cache = key_value[prefill_cache_indices] + else: + kv_to_cache = key_value + vllm_cache_ops.reshape_and_cache( - key_value[:, 0], key_value[:, 1], kv_cache[0], kv_cache[1], slots + kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots ) # output @@ -367,6 +373,7 @@ class Block(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) hidden_states, residual = self.ln_2(hidden_states, residual) @@ -420,6 +427,7 @@ class FlashSantacoderModel(nn.Module): slots: torch.Tensor, input_lengths: torch.Tensor, max_s: int, + prefill_cache_indices, ) -> torch.Tensor: hidden_states = self.wte(input_ids) + self.wpe(position_ids) @@ -437,6 +445,7 @@ class FlashSantacoderModel(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) hidden_states, _ = self.ln_f(hidden_states, residual) @@ -462,8 +471,19 @@ class FlashSantacoderForCausalLM(nn.Module): slots: torch.Tensor, input_lengths: torch.Tensor, max_s: int, + prefill_cache_indices: Optional[torch.Tensor], + sliding_window: int, lm_head_indices: Optional[torch.Tensor] = None, ) -> torch.Tensor: + if prefill_cache_indices is not None: + # Slots also need to be sliced as it has the same size as the whole kv tensor + slots = slots[prefill_cache_indices] + elif sliding_window != -1: + # Clamp in decode mode as paged attention requires clamped values whereas the flash attention + # kernel requires the true values + max_s = min(sliding_window, max_s) + 
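Every `*ForCausalLM.forward` above gains the same prefill/decode adjustment; the function below is a hedged standalone rewrite of that shared logic, with `-1` meaning "no sliding window", as in the callers:

```python
from typing import Optional, Tuple

import torch


def adjust_for_sliding_window(
    slots: torch.Tensor,
    input_lengths: torch.Tensor,
    max_s: int,
    prefill_cache_indices: Optional[torch.Tensor],
    sliding_window: int,
) -> Tuple[torch.Tensor, torch.Tensor, int]:
    if prefill_cache_indices is not None:
        # Prefill: slots covers the whole prompt, keep only the cached positions
        slots = slots[prefill_cache_indices]
    elif sliding_window != -1:
        # Decode: paged attention needs lengths clamped to the window,
        # while flash attention (prefill) needs the true values
        max_s = min(sliding_window, max_s)
        input_lengths = torch.clamp(input_lengths, max=sliding_window)
    return slots, input_lengths, max_s
```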
input_lengths = torch.clamp(input_lengths, max=sliding_window) + hidden_states = self.transformer( input_ids, position_ids, @@ -473,6 +493,7 @@ class FlashSantacoderForCausalLM(nn.Module): slots, input_lengths, max_s, + prefill_cache_indices, ) if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] diff --git a/server/text_generation_server/models/custom_modeling/idefics_image_processing.py b/server/text_generation_server/models/custom_modeling/idefics_image_processing.py index 21aa3ff3..0a6f24e0 100644 --- a/server/text_generation_server/models/custom_modeling/idefics_image_processing.py +++ b/server/text_generation_server/models/custom_modeling/idefics_image_processing.py @@ -198,7 +198,9 @@ class IdeficsImageProcessor(BaseImageProcessor): image = image_url_or_urls if image.startswith("http://") or image.startswith("https://"): - response = requests.get(image_url_or_urls, stream=True, headers=headers, timeout=(1, 5)) + response = requests.get( + image_url_or_urls, stream=True, headers=headers, timeout=(1, 5) + ) response.raise_for_status() content = response.content else: @@ -208,7 +210,7 @@ class IdeficsImageProcessor(BaseImageProcessor): image = Image.open(BytesIO(content)) # image.verify() except Exception: - raise ValueError(f"Could not load image from url {image_url_or_urls}") + raise ValueError(f"Could not load image from url {image_url_or_urls}") return image else: raise ValueError( diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 1fe40c0c..09bb078b 100644 --- a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -9,7 +9,7 @@ import numpy as np from dataclasses import dataclass from opentelemetry import trace from transformers import PreTrainedTokenizerBase -from typing import Optional, Tuple, List, Type, Union, Dict +from typing import Optional, Tuple, List, Type, Dict from text_generation_server.models import Model from text_generation_server.models.types import ( @@ -24,6 +24,10 @@ from text_generation_server.models.cache_manager import ( set_cache_manager, BLOCK_SIZE, ) +from text_generation_server.models.sliding_window import ( + set_sliding_window_from_env, + get_sliding_window, +) from text_generation_server.pb import generate_pb2 from text_generation_server.utils import StoppingCriteria, HeterogeneousNextTokenChooser from text_generation_server.utils.dist import MEMORY_FRACTION @@ -47,6 +51,12 @@ class FlashCausalLMBatch(Batch): # tensor of length b containing the cumulative sequence lengths of the sequences in the batch, only used in prefill cu_seqlen_prefill: Optional[torch.Tensor] + # Sliding window values + + # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers + # as we only keep SLIDING_WINDOW values instead of the whole tensor + prefill_cache_indices: Optional[torch.Tensor] + # Paged Attention values # Set when creating the batch @@ -109,6 +119,8 @@ class FlashCausalLMBatch(Batch): dtype: torch.dtype, device: torch.device, ) -> "FlashCausalLMBatch": + sliding_window = get_sliding_window() + batch_inputs = [] max_truncation = 0 for r in pb.requests: @@ -124,6 +136,7 @@ class FlashCausalLMBatch(Batch): needed_blocks_slots = [] start_slots = [] slot_indices = [] + prefill_cache_indices = [] input_lengths = [] prefix_offsets = [] @@ -187,8 +200,15 @@ class FlashCausalLMBatch(Batch): # Paged attention # Remove one as the first token des not 
have a past total_tokens = input_length + max_new_tokens - 1 + needed_blocks = math.ceil(total_tokens / BLOCK_SIZE) + + # If using sliding window + if sliding_window is not None: + # Needed blocks can not go over SLIDING_WINDOW_BLOCKS + needed_blocks = min(needed_blocks, sliding_window.blocks) blocks += needed_blocks + needed_blocks_slots.append((needed_blocks, total_tokens)) start_slots.append(cumulative_max_length) @@ -199,6 +219,32 @@ class FlashCausalLMBatch(Batch): ) slot_indices.append(request_slot_indices) + # If using sliding window + if sliding_window is not None: + # Start of the sliding window cache + start_offset = max( + 0, + input_length - sliding_window.size + sliding_window.attention_sinks, + ) + + if sliding_window.attention_sinks > 0 and start_offset > 0: + # Attention sinks indices + request_attention_sinks_cache_indices = torch.arange( + cumulative_length, + cumulative_length + + min(sliding_window.attention_sinks, start_offset), + dtype=torch.int64, + ) + prefill_cache_indices.append(request_attention_sinks_cache_indices) + + # Create tensor to slice into the kv tensor in prefill + request_prefill_cache_indices = torch.arange( + cumulative_length + start_offset, + cumulative_length + input_length, + dtype=torch.int64, + ) + prefill_cache_indices.append(request_prefill_cache_indices) + all_prefill_logprobs = all_prefill_logprobs and r.prefill_logprobs no_prefill_logprobs = no_prefill_logprobs and not r.prefill_logprobs @@ -252,12 +298,26 @@ class FlashCausalLMBatch(Batch): position_ids = position_ids[0] slot_indices = slot_indices[0] + if len(prefill_cache_indices) > 1: + prefill_cache_indices = ( + torch.cat(prefill_cache_indices) if prefill_cache_indices else None + ) + else: + prefill_cache_indices = ( + prefill_cache_indices[0] if prefill_cache_indices else None + ) + cu_seqlen_prefill = torch.tensor( cu_seqlen_prefill, device=device, dtype=torch.int32 ) position_ids = position_ids.to(device) slot_indices = slot_indices.to(device) + prefill_cache_indices = ( + prefill_cache_indices.to(device) + if prefill_cache_indices is not None + else None + ) input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) input_lengths_tensor = torch.tensor( input_lengths, dtype=torch.int32, device=device @@ -309,6 +369,7 @@ class FlashCausalLMBatch(Batch): top_n_tokens_tensor=top_n_tokens_tensor, blocks=blocks, max_blocks=max_blocks, + prefill_cache_indices=prefill_cache_indices, ) @tracer.start_as_current_span("filter") @@ -425,7 +486,7 @@ class FlashCausalLMBatch(Batch): # Move to GPU now that we have the whole tensor slot_indices = slot_indices.to(device) - return type(self)( + return FlashCausalLMBatch( batch_id=self.batch_id, requests=requests, requests_idx_mapping=requests_idx_mapping, @@ -454,6 +515,7 @@ class FlashCausalLMBatch(Batch): top_n_tokens_tensor=top_n_tokens_tensor, blocks=blocks, max_blocks=max_blocks, + prefill_cache_indices=None, ) @classmethod @@ -611,6 +673,7 @@ class FlashCausalLMBatch(Batch): top_n_tokens_tensor=top_n_tokens_tensor, blocks=blocks, max_blocks=max_blocks, + prefill_cache_indices=None, ) def __del__(self): @@ -636,11 +699,11 @@ class FlashCausalLM(Model): device: torch.device, rank: int = 0, world_size: int = 1, - sliding_window: Optional[int] = None, ): self.num_layers = num_layers self.num_kv_heads = num_kv_heads self.head_size = head_size + set_sliding_window_from_env() super(FlashCausalLM, self).__init__( model=model, @@ -650,7 +713,6 @@ class FlashCausalLM(Model): device=device, rank=rank, world_size=world_size, - 
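The index construction in `from_pb` above is easiest to read for a single request. The helper below is a hypothetical standalone rewrite of it; the real code offsets every request by `cumulative_length` across the batch:

```python
import torch


def prefill_cache_indices_for_request(
    input_length: int, window: int, sinks: int, cumulative_length: int = 0
) -> torch.Tensor:
    indices = []
    start_offset = max(0, input_length - window + sinks)
    if sinks > 0 and start_offset > 0:
        # Always cache the first `sinks` prompt tokens (the attention sinks)
        indices.append(
            torch.arange(
                cumulative_length,
                cumulative_length + min(sinks, start_offset),
                dtype=torch.int64,
            )
        )
    # Cache the tail of the prompt that still fits in the sliding window
    indices.append(
        torch.arange(
            cumulative_length + start_offset,
            cumulative_length + input_length,
            dtype=torch.int64,
        )
    )
    return torch.cat(indices)


print(prefill_cache_indices_for_request(input_length=10, window=6, sinks=2).tolist())
# [0, 1, 6, 7, 8, 9]
```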
sliding_window=sliding_window, ) @property @@ -658,6 +720,8 @@ class FlashCausalLM(Model): return FlashCausalLMBatch def warmup(self, batch: FlashCausalLMBatch): + sliding_window = get_sliding_window() + torch.cuda.empty_cache() try: cache_manager = set_cache_manager( @@ -665,7 +729,8 @@ class FlashCausalLM(Model): self.num_layers, self.num_kv_heads, self.head_size, - self.sliding_window is not None, + sliding_window.attention_sinks if sliding_window is not None else 0, + True if sliding_window is not None else False, self.dtype, self.device, ) @@ -705,7 +770,8 @@ class FlashCausalLM(Model): self.num_layers, self.num_kv_heads, self.head_size, - self.sliding_window is not None, + sliding_window.attention_sinks if sliding_window is not None else 0, + True if sliding_window is not None else False, self.dtype, self.device, ) @@ -713,8 +779,10 @@ class FlashCausalLM(Model): return int(num_blocks * BLOCK_SIZE) def forward(self, batch: FlashCausalLMBatch) -> Tuple[torch.Tensor, torch.Tensor]: + sliding_window = get_sliding_window() + # Model Forward - return self.model.forward( + logits = self.model.forward( input_ids=batch.input_ids, position_ids=batch.position_ids, cu_seqlen_prefill=batch.cu_seqlen_prefill, @@ -723,8 +791,13 @@ class FlashCausalLM(Model): slots=batch.slots[batch.slot_indices], input_lengths=batch.input_lengths_tensor, max_s=batch.max_seqlen, + prefill_cache_indices=batch.prefill_cache_indices, + sliding_window=sliding_window.size if sliding_window is not None else -1, lm_head_indices=batch.prefill_head_indices, ) + if batch.prefill_cache_indices is not None: + batch.prefill_cache_indices = None + return logits @tracer.start_as_current_span("generate_token") def generate_token( diff --git a/server/text_generation_server/models/flash_mistral.py b/server/text_generation_server/models/flash_mistral.py index 919e4625..0c65be40 100644 --- a/server/text_generation_server/models/flash_mistral.py +++ b/server/text_generation_server/models/flash_mistral.py @@ -1,21 +1,14 @@ -import math import torch import torch.distributed -import numpy as np - -from dataclasses import dataclass from opentelemetry import trace -from transformers import PreTrainedTokenizerBase from transformers.models.llama import LlamaTokenizerFast -from typing import Optional, Tuple, Type +from typing import Optional -from text_generation_server.pb import generate_pb2 from text_generation_server.models import FlashCausalLM -from text_generation_server.models.flash_causal_lm import FlashCausalLMBatch, BLOCK_SIZE -from text_generation_server.models.cache_manager import ( - get_cache_manager, - set_cache_manager, +from text_generation_server.models.sliding_window import ( + set_sliding_window, + get_sliding_window, ) from text_generation_server.models.custom_modeling.flash_mistral_modeling import ( FlashMistralForCausalLM, @@ -25,255 +18,10 @@ from text_generation_server.utils import ( initialize_torch_distributed, weight_files, Weights, - HeterogeneousNextTokenChooser, - StoppingCriteria, ) tracer = trace.get_tracer(__name__) -# Will be set in init -SLIDING_WINDOW: Optional[int] = None -SLIDING_WINDOW_BLOCKS: Optional[int] = None - - -# Adds windowing logic to FlashCausalLMBatch -@dataclass -class FlashMistralBatch(FlashCausalLMBatch): - # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers - # as we only keep SLIDING_WINDOW values instead of the whole tensor - prefill_cache_indices: Optional[torch.Tensor] = None - - @classmethod - def from_pb( - cls, - pb: 
generate_pb2.Batch, - tokenizer: PreTrainedTokenizerBase, - dtype: torch.dtype, - device: torch.device, - ) -> "FlashCausalLMBatch": - global SLIDING_WINDOW - global SLIDING_WINDOW_BLOCKS - - batch_inputs = [] - max_truncation = 0 - for r in pb.requests: - batch_inputs.append(r.inputs) - max_truncation = max(max_truncation, r.truncate) - - batch_tokenized_inputs = tokenizer( - batch_inputs, truncation=True, max_length=max_truncation - )["input_ids"] - - position_ids = [] - cu_seqlen_prefill = [0] - needed_blocks_slots = [] - start_slots = [] - slot_indices = [] - prefill_cache_indices = [] - - input_lengths = [] - prefix_offsets = [] - read_offsets = [] - all_input_ids = [] - requests_idx_mapping = {} - - all_prefill_logprobs = True - no_prefill_logprobs = True - prefill_head_indices = [] - prefill_next_token_indices = [] - prefill_cu_outlens = [0] - - next_token_chooser_parameters = [] - stopping_criterias = [] - top_n_tokens = [] - - # Cumulative length - cumulative_length = 0 - cumulative_max_length = 0 - prefill_out_cumulative_length = 0 - - blocks = 0 - max_seqlen = 0 - max_length = 0 - max_blocks = 0 - - # Parse batch - for i, (r, tokenized_input) in enumerate( - zip(pb.requests, batch_tokenized_inputs) - ): - # request id -> idx in list mapping - requests_idx_mapping[r.id] = i - - tokenized_input = tokenized_input[-r.truncate :] - - input_length = len(tokenized_input) - input_lengths.append(input_length) - - prefix_offsets.append(input_length - 5) - read_offsets.append(input_length) - - all_input_ids.append(tokenized_input) - - # Position ids - request_position_ids = torch.arange(0, input_length, dtype=torch.int32) - position_ids.append(request_position_ids) - - # Add cumulative lengths of all previous inputs - cu_seqlen_prefill.append(cumulative_length + input_length) - - next_token_chooser_parameters.append(r.parameters) - - stopping_criteria = StoppingCriteria.from_pb( - r.stopping_parameters, tokenizer - ) - max_new_tokens = stopping_criteria.max_new_tokens - stopping_criterias.append(stopping_criteria) - top_n_tokens.append(r.top_n_tokens) - - # Paged attention - # Remove one as the first token des not have a past - total_tokens = input_length + max_new_tokens - 1 - - # Needed blocks can not go over SLIDING_WINDOW_BLOCKS - needed_blocks = min( - math.ceil(total_tokens / BLOCK_SIZE), SLIDING_WINDOW_BLOCKS - ) - blocks += needed_blocks - - needed_blocks_slots.append((needed_blocks, total_tokens)) - start_slots.append(cumulative_max_length) - - request_slot_indices = torch.arange( - cumulative_max_length, - cumulative_max_length + input_length, - dtype=torch.int64, - ) - slot_indices.append(request_slot_indices) - - # Create tensor to slice into the kv tensor in prefill - request_prefill_cache_indices = torch.arange( - cumulative_length + max(0, input_length - SLIDING_WINDOW), - cumulative_length + input_length, - dtype=torch.int64, - ) - prefill_cache_indices.append(request_prefill_cache_indices) - - all_prefill_logprobs = all_prefill_logprobs and r.prefill_logprobs - no_prefill_logprobs = no_prefill_logprobs and not r.prefill_logprobs - - if r.prefill_logprobs: - prefill_head_indices.append(request_position_ids + cumulative_length) - prefill_next_token_indices.append( - prefill_out_cumulative_length + input_length - 1 - ) - prefill_cu_outlens.append(prefill_out_cumulative_length + input_length) - prefill_out_cumulative_length += input_length - else: - prefill_head_indices.append( - torch.tensor( - [cumulative_length + input_length - 1], dtype=torch.int32 - ) - ) - 
prefill_next_token_indices.append(prefill_out_cumulative_length) - prefill_cu_outlens.append(prefill_out_cumulative_length + 1) - prefill_out_cumulative_length += 1 - - # Update - cumulative_length += input_length - cumulative_max_length += total_tokens - max_seqlen = max(max_seqlen, input_length) - max_blocks = max(max_blocks, needed_blocks) - max_length = max(max_length, input_length + max_new_tokens) - - next_token_chooser = HeterogeneousNextTokenChooser.from_pb( - next_token_chooser_parameters, dtype, device - ) - start_slots = torch.tensor(start_slots, dtype=torch.int64) - - # Padded all_input_ids_tensor - all_input_ids_tensor = np.zeros( - (len(all_input_ids), max_length), dtype=np.int64 - ) - for i, input_ids in enumerate(all_input_ids): - all_input_ids_tensor[i, : len(input_ids)] = input_ids - - # Create tensors on device - all_input_ids_tensor = torch.tensor( - all_input_ids_tensor, dtype=torch.int64, device=device - ) - - if len(pb.requests) > 1: - input_ids = np.concatenate(all_input_ids, dtype=np.int64) - position_ids = torch.cat(position_ids) - slot_indices = torch.cat(slot_indices) - prefill_cache_indices = torch.cat(prefill_cache_indices) - else: - input_ids = all_input_ids[0] - position_ids = position_ids[0] - slot_indices = slot_indices[0] - prefill_cache_indices = prefill_cache_indices[0] - - cu_seqlen_prefill = torch.tensor( - cu_seqlen_prefill, device=device, dtype=torch.int32 - ) - - position_ids = position_ids.to(device) - slot_indices = slot_indices.to(device) - prefill_cache_indices = prefill_cache_indices.to(device) - input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) - input_lengths_tensor = torch.tensor( - input_lengths, dtype=torch.int32, device=device - ) - - if all_prefill_logprobs: - prefill_head_indices = None - prefill_next_token_indices = cu_seqlen_prefill[1:] - 1 - elif no_prefill_logprobs: - prefill_head_indices = cu_seqlen_prefill[1:] - 1 - prefill_next_token_indices = None - else: - prefill_head_indices = torch.tensor( - torch.cat(prefill_head_indices), dtype=torch.int64, device=device - ) - prefill_next_token_indices = torch.tensor( - prefill_next_token_indices, dtype=torch.int64, device=device - ) - top_n_tokens_tensor = torch.tensor( - top_n_tokens, device=device, dtype=torch.int64 - ) - - return cls( - batch_id=pb.id, - requests=pb.requests, - requests_idx_mapping=requests_idx_mapping, - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - start_slots=start_slots, - slot_indices=slot_indices, - needed_blocks_slots=needed_blocks_slots, - block_tables=None, - block_tables_tensor=None, - slots=None, - max_seqlen=max_seqlen, - prefill_head_indices=prefill_head_indices, - prefill_next_token_indices=prefill_next_token_indices, - prefill_cu_outlens=prefill_cu_outlens, - input_lengths=input_lengths, - input_lengths_tensor=input_lengths_tensor, - prefix_offsets=prefix_offsets, - read_offsets=read_offsets, - all_input_ids=all_input_ids, - all_input_ids_tensor=all_input_ids_tensor, - next_token_chooser=next_token_chooser, - stopping_criterias=stopping_criterias, - top_n_tokens=top_n_tokens, - top_n_tokens_tensor=top_n_tokens_tensor, - blocks=blocks, - max_blocks=max_blocks, - prefill_cache_indices=prefill_cache_indices, - ) - class FlashMistral(FlashCausalLM): def __init__( @@ -284,9 +32,6 @@ class FlashMistral(FlashCausalLM): dtype: Optional[torch.dtype] = None, trust_remote_code: bool = False, ): - global SLIDING_WINDOW - global SLIDING_WINDOW_BLOCKS - self.process_group, rank, world_size = 
initialize_torch_distributed() if torch.cuda.is_available(): device = torch.device(f"cuda:{rank}") @@ -308,8 +53,7 @@ class FlashMistral(FlashCausalLM): config.quantize = quantize # Set context windows - SLIDING_WINDOW = config.sliding_window - SLIDING_WINDOW_BLOCKS = math.ceil(config.sliding_window / BLOCK_SIZE) + set_sliding_window(config.sliding_window, 0) torch.distributed.barrier(group=self.process_group) @@ -331,27 +75,4 @@ class FlashMistral(FlashCausalLM): device=device, rank=rank, world_size=world_size, - sliding_window=config.sliding_window, ) - - @property - def batch_type(self) -> Type[FlashMistralBatch]: - return FlashMistralBatch - - def forward(self, batch: FlashMistralBatch) -> Tuple[torch.Tensor, torch.Tensor]: - # Model Forward - logits = self.model.forward( - input_ids=batch.input_ids, - position_ids=batch.position_ids, - cu_seqlen_prefill=batch.cu_seqlen_prefill, - kv_cache=get_cache_manager().kv_cache, - block_tables=batch.block_tables_tensor, - slots=batch.slots[batch.slot_indices], - input_lengths=batch.input_lengths_tensor, - max_s=batch.max_seqlen, - prefill_cache_indices=batch.prefill_cache_indices, - lm_head_indices=batch.prefill_head_indices, - ) - if batch.prefill_cache_indices is not None: - batch.prefill_cache_indices = None - return logits diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py index 17d2ea9b..775d073c 100644 --- a/server/text_generation_server/models/model.py +++ b/server/text_generation_server/models/model.py @@ -3,10 +3,11 @@ import torch from abc import ABC, abstractmethod from typing import List, Tuple, Optional, TypeVar, Type -from transformers import PreTrainedTokenizerBase, PretrainedConfig +from transformers import PreTrainedTokenizerBase from text_generation_server.models.types import Batch, Generation from text_generation_server.pb.generate_pb2 import InfoResponse +from text_generation_server.models.sliding_window import get_sliding_window B = TypeVar("B", bound=Batch) @@ -21,7 +22,6 @@ class Model(ABC): device: torch.device, rank: int = 0, world_size: int = 1, - sliding_window: Optional[int] = None, ): self.model = model.eval() self.tokenizer = tokenizer @@ -31,7 +31,6 @@ class Model(ABC): self.device = device self.rank = rank self.world_size = world_size - self.sliding_window = sliding_window self.has_position_ids = ( inspect.signature(model.forward).parameters.get("position_ids", None) @@ -42,14 +41,15 @@ class Model(ABC): @property def info(self) -> InfoResponse: - if self.requires_padding and self.sliding_window is not None: + sliding_window = get_sliding_window() + if self.requires_padding and sliding_window is not None: raise NotImplementedError("sliding_window is not implemented with padding") return InfoResponse( requires_padding=self.requires_padding, dtype=str(self.dtype), device_type=self.device.type, - window_size=self.sliding_window, + window_size=sliding_window.size if sliding_window is not None else None, ) @property diff --git a/server/text_generation_server/models/sliding_window.py b/server/text_generation_server/models/sliding_window.py new file mode 100644 index 00000000..2df39d8b --- /dev/null +++ b/server/text_generation_server/models/sliding_window.py @@ -0,0 +1,41 @@ +import os +import math + +from typing import Optional + +from text_generation_server.models.cache_manager import BLOCK_SIZE + +SLIDING_WINDOW: Optional["SlidingWindow"] = None + + +class SlidingWindow: + def __init__(self, size: int, attention_sinks: int): + self.size = size + self.blocks = 
math.ceil(size / BLOCK_SIZE) + self.attention_sinks = attention_sinks + + @classmethod + def from_env(cls) -> Optional["SlidingWindow"]: + sliding_window_env = os.getenv("SLIDING_WINDOW", None) + if sliding_window_env is not None: + return cls(int(sliding_window_env), int(os.getenv("ATTENTION_SINKS", 0))) + return None + + +def set_sliding_window(size: int, attention_sinks: int) -> SlidingWindow: + global SLIDING_WINDOW + SLIDING_WINDOW = SlidingWindow(size, attention_sinks) + return SLIDING_WINDOW + + +def set_sliding_window_from_env() -> Optional[SlidingWindow]: + global SLIDING_WINDOW + env_sliding_window = SlidingWindow.from_env() + if env_sliding_window is not None: + SLIDING_WINDOW = env_sliding_window + return SLIDING_WINDOW + + +def get_sliding_window() -> Optional[SlidingWindow]: + global SLIDING_WINDOW + return SLIDING_WINDOW diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py index f38f130e..22c7b73a 100644 --- a/server/text_generation_server/utils/layers.py +++ b/server/text_generation_server/utils/layers.py @@ -604,15 +604,16 @@ try: elif rope_scaling["type"] == "yarn": return YarnPositionRotaryEmbedding( dim=2 * inv_freq.shape[0], - max_position_embeddings=rope_scaling["original_max_position_embeddings"], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], base=10000.0, device=inv_freq.device, scaling_factor=scaling_factor, extrapolation_factor=1, attn_factor=1, beta_fast=32, - beta_slow=1 - + beta_slow=1, ) else: raise NotImplementedError( @@ -645,15 +646,16 @@ try: elif rope_scaling["type"] == "yarn": return YarnPositionRotaryEmbedding( dim=2 * inv_freq.shape[0], - max_position_embeddings=rope_scaling["original_max_position_embeddings"], + max_position_embeddings=rope_scaling[ + "original_max_position_embeddings" + ], base=10000.0, device=inv_freq.device, scaling_factor=scaling_factor, extrapolation_factor=1, attn_factor=1, beta_fast=32, - beta_slow=1 - + beta_slow=1, ) else: raise NotImplementedError( @@ -734,19 +736,27 @@ try: self._cos_cached = torch.cos(freqs).to(dtype) self._sin_cached = torch.sin(freqs).to(dtype) - # Inverse dim formula to find dim based on number of rotations import math - def find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): - return (dim * math.log(max_position_embeddings/(num_rotations * 2 * math.pi)))/(2 * math.log(base)) + + def find_correction_dim( + num_rotations, dim, base=10000, max_position_embeddings=2048 + ): + return ( + dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi)) + ) / (2 * math.log(base)) # Find dim range bounds based on rotations - def find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): - low = math.floor(find_correction_dim( - low_rot, dim, base, max_position_embeddings)) - high = math.ceil(find_correction_dim( - high_rot, dim, base, max_position_embeddings)) - return max(low, 0), min(high, dim-1) # Clamp values just in case + def find_correction_range( + low_rot, high_rot, dim, base=10000, max_position_embeddings=2048 + ): + low = math.floor( + find_correction_dim(low_rot, dim, base, max_position_embeddings) + ) + high = math.ceil( + find_correction_dim(high_rot, dim, base, max_position_embeddings) + ) + return max(low, 0), min(high, dim - 1) # Clamp values just in case def linear_ramp_mask(min, max, dim): if min == max: @@ -762,7 +772,19 @@ try: return 0.1 * math.log(scale) + 1.0 class YarnPositionRotaryEmbedding(PositionRotaryEmbedding): - def 
__init__(self, dim, max_position_embeddings, base, device, scaling_factor,*, extrapolation_factor, attn_factor, beta_fast, beta_slow): + def __init__( + self, + dim, + max_position_embeddings, + base, + device, + scaling_factor, + *, + extrapolation_factor, + attn_factor, + beta_fast, + beta_slow, + ): inv_freq = _create_inv_freq(dim, base, device) super().__init__(inv_freq, scaling_factor) self.dim = dim @@ -772,7 +794,9 @@ try: self.attn_factor = attn_factor self.beta_fast = beta_fast self.beta_slow = beta_slow - self.mscale = float(get_mscale(self.scaling_factor) * self.attn_factor) # Get n-d magnitude scaling corrected for interpolation + self.mscale = float( + get_mscale(self.scaling_factor) * self.attn_factor + ) # Get n-d magnitude scaling corrected for interpolation def _update_cos_sin_cache(self, dtype, device, seqlen): # Reset the tables if the sequence length has changed, @@ -788,13 +812,26 @@ try: ) freqs = 1.0 / inv_freq_extrapolation inv_freq_interpolation = 1.0 / (self.scaling_factor * freqs) - low, high = find_correction_range(self.beta_fast, self.beta_slow, self.dim, self.base, self.max_position_embeddings) - inv_freq_mask = (1 - linear_ramp_mask(low, high, self.dim // 2).float().to(device)) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation - inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + low, high = find_correction_range( + self.beta_fast, + self.beta_slow, + self.dim, + self.base, + self.max_position_embeddings, + ) + inv_freq_mask = ( + 1 + - linear_ramp_mask(low, high, self.dim // 2).float().to(device) + ) * self.extrapolation_factor # Get n-d rotational scaling corrected for extrapolation + inv_freq = ( + inv_freq_interpolation * (1 - inv_freq_mask) + + inv_freq_extrapolation * inv_freq_mask + ) self.inv_freq = inv_freq - self.mscale = float(get_mscale(self.scaling_factor) * self.attn_factor) # Get n-d magnitude scaling corrected for interpolation - + self.mscale = float( + get_mscale(self.scaling_factor) * self.attn_factor + ) # Get n-d magnitude scaling corrected for interpolation self._seq_len_cached = seqlen t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index 2f330d9c..9c91662f 100644 --- a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -16,7 +16,7 @@ class Weights: dtype, process_group, aliases: Optional[Dict[str, List[str]]] = None, - prefix: Optional[str] = None + prefix: Optional[str] = None, ): routing = {} for filename in filenames: @@ -213,7 +213,8 @@ class Weights: bits, groupsize = self._get_gptq_params() from text_generation_server.utils.layers import HAS_EXLLAMA - use_exllama = bits==4 and HAS_EXLLAMA and quantize == "gptq" + + use_exllama = bits == 4 and HAS_EXLLAMA and quantize == "gptq" weight = (qweight, qzeros, scales, g_idx, bits, groupsize, use_exllama) else: w = [self.get_sharded(f"{p}.weight", dim=0) for p in prefixes] diff --git a/update_doc.py b/update_doc.py index 6206e211..6127418c 100644 --- a/update_doc.py +++ b/update_doc.py @@ -21,14 +21,14 @@ def main(): block = [] for line in lines: if line.startswith(" -") or line.startswith(" -"): - rendered_block = '\n'.join(block) + rendered_block = "\n".join(block) if header: final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n" else: final_doc += f"```shell\n{rendered_block}\n```\n" block = [] tokens = line.split("<") 
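For reference, the YaRN block being reformatted above implements a per-dimension frequency blend plus an attention-magnitude correction; in notation of my own choosing (matching the code, not taken from the diff):

$$
\hat f_i = (1 - \gamma_i)\,\frac{f_i}{s} + \gamma_i\, f_i,
\qquad
m = 0.1 \ln s + 1,
$$

where $f_i$ are the base inverse frequencies, $s$ is `scaling_factor`, $\gamma_i$ is `inv_freq_mask` (one minus the linear ramp between the `beta_fast`/`beta_slow` correction dims, times `extrapolation_factor`), and $m$ is `mscale`, applied to the cached cos/sin (for $s \le 1$ the code keeps $m = 1$).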
- if len(tokens)>1: + if len(tokens) > 1: header = tokens[-1][:-1] else: header = line.split("--")[-1] @@ -36,7 +36,7 @@ def main(): block.append(line) - rendered_block = '\n'.join(block) + rendered_block = "\n".join(block) final_doc += f"## {header}\n```shell\n{rendered_block}\n```\n" block = []
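Putting the pieces together, a rough sketch of how the launcher flags reach the Python side through the new `sliding_window` module (values are examples; `BLOCK_SIZE` is the paged-attention block size, 16 at the time of writing):

```python
import os

from text_generation_server.models.sliding_window import (
    get_sliding_window,
    set_sliding_window_from_env,
)

# The launcher exports these when --sliding-window / --attention-sinks are set
os.environ["SLIDING_WINDOW"] = "4096"
os.environ["ATTENTION_SINKS"] = "4"

set_sliding_window_from_env()  # called once in FlashCausalLM.__init__
sw = get_sliding_window()
print(sw.size, sw.blocks, sw.attention_sinks)
# 4096 256 4   (256 = ceil(4096 / BLOCK_SIZE) assuming BLOCK_SIZE == 16)
```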