From 0759ec495e15a865d2a59befc2b796b5acc09b50 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Tue, 2 Jul 2024 14:26:47 +0200
Subject: [PATCH] Hotfixing qwen2 and starcoder2 (which also get clamping).
 (#2167)

---
 .../models/custom_modeling/flash_qwen2_modeling.py              | 2 +-
 .../models/custom_modeling/flash_starcoder2_modeling.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
index 2e281386..1cc6a613 100644
--- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
@@ -368,7 +368,7 @@ class Qwen2ForCausalLM(torch.nn.Module):
         elif self.max_past is not None:
             # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
             # kernel requires the true values
-            input_lengths = torch.clamp(input_lengths, max=self.max_past_tensor)
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
 
         hidden_states = self.model(
             input_ids,
diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
index df864bc1..a0273c37 100644
--- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py
@@ -534,7 +534,7 @@ class FlashStarcoder2ForCausalLM(torch.nn.Module):
         elif self.max_past is not None:
             # Clamp in decode mode as paged attention requires clamped values whereas the flash attention
             # kernel requires the true values
-            input_lengths = torch.clamp(input_lengths, max=self.max_past_tensor)
+            input_lengths = input_lengths.clamp(max=self.max_past_tensor)
 
         hidden_states = self.model(
             input_ids,