From add4d42cb3aebf5a0e04ba117d6ba445ae604153 Mon Sep 17 00:00:00 2001
From: Felix Marty <9808326+fxmarty@users.noreply.github.com>
Date: Tue, 2 Jul 2024 12:52:55 +0000
Subject: [PATCH] do not use tunableop for non flash-causal-lm models

---
 .github/workflows/build.yaml                  |  6 ++++++
 server/text_generation_server/models/mamba.py | 10 ++++++++++
 server/text_generation_server/models/model.py | 19 +++++++++++++++++++
 3 files changed, 35 insertions(+)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 18b3a09f..121917f0 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -265,4 +265,10 @@ jobs:
           echo "DOCKER_VOLUME:"
           echo $DOCKER_VOLUME
 
+          # TunableOp warmup is rather slow, do it only for a few seqlens.
+          if [[ ${{ inputs.hardware }} == "rocm" ]]
+          then
+            export PYTORCH_TUNABLEOP_SEQLENS=2,4
+          fi
+
           pytest -s -vvvvv integration-tests ${PYTEST_FLAGS}
diff --git a/server/text_generation_server/models/mamba.py b/server/text_generation_server/models/mamba.py
index 9189b45c..1ce8346c 100644
--- a/server/text_generation_server/models/mamba.py
+++ b/server/text_generation_server/models/mamba.py
@@ -466,6 +466,16 @@ class Mamba(Model):
         return MambaBatch
 
     def warmup(self, batch) -> Optional[int]:
+        if SYSTEM == "rocm" and (
+            os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None
+            or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1"
+        ):
+            logger.info(
+                f"ROCm: TunableOp is enabled (PYTORCH_TUNABLEOP_ENABLED is unset or set to 1) but is not supported for {self.model_id} (instance of {self.__class__.__name__}). Disabling TunableOp."
+            )
+            torch.cuda.tunable.tuning_enable(False)
+            torch.cuda.tunable.enable(False)
+
         # TODO: implement warmup for Mamba if needed
         if CUDA_GRAPHS:
             if self.speculate is None or self.speculate == 0:
diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py
index c90fd38a..819414aa 100644
--- a/server/text_generation_server/models/model.py
+++ b/server/text_generation_server/models/model.py
@@ -15,6 +15,9 @@ from text_generation_server.utils.adapter import (
     AdapterParameters,
     AdapterSource,
 )
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.models.globals import CUDA_GRAPHS
+import os
 
 from loguru import logger
 
@@ -100,7 +103,23 @@ class Model(ABC):
         raise NotImplementedError
 
     def warmup(self, batch: B) -> Optional[int]:
+        if SYSTEM == "rocm" and (
+            os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None
+            or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1"
+        ):
+            logger.info(
+                f"ROCm: TunableOp is enabled (PYTORCH_TUNABLEOP_ENABLED is unset or set to 1) but is not supported for {self.model_id} (instance of {self.__class__.__name__}). Disabling TunableOp."
+            )
+            torch.cuda.tunable.tuning_enable(False)
+            torch.cuda.tunable.enable(False)
+
         self.generate_token(batch)
+
+        if CUDA_GRAPHS:
+            logger.info(
+                f"Got CUDA_GRAPHS={CUDA_GRAPHS} but CUDA graphs are not supported for {self.model_id} (instance of {self.__class__.__name__}). CUDA graphs will not be used."
+            )
+
         return None
 
     def decode_token(
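
Note: the gating that both warmup() hunks add reduces to the standalone Python sketch below. It is an illustration under assumptions, not part of the patch: the maybe_disable_tunableop helper name and its arguments are hypothetical, while os.environ.get("PYTORCH_TUNABLEOP_ENABLED"), torch.cuda.tunable.tuning_enable(False), and torch.cuda.tunable.enable(False) are the calls the patch itself makes. The unset-or-"1" check mirrors the fact that the server enables TunableOp by default on ROCm.

    # Standalone sketch of the check added to Model.warmup() and Mamba.warmup().
    # maybe_disable_tunableop() is a hypothetical helper, for illustration only.
    import os

    import torch


    def maybe_disable_tunableop(system: str, model_id: str) -> None:
        # The server enables TunableOp by default on ROCm, so an unset
        # PYTORCH_TUNABLEOP_ENABLED is treated the same as "1".
        enabled = os.environ.get("PYTORCH_TUNABLEOP_ENABLED")
        if system == "rocm" and enabled in (None, "1"):
            # Disable both the tuning passes and the use of tuned results,
            # since this model does not go through the flash-causal-lm path.
            torch.cuda.tunable.tuning_enable(False)
            torch.cuda.tunable.enable(False)

Calling maybe_disable_tunableop("rocm", self.model_id) at the top of each warmup() override would factor out the block that the patch duplicates in model.py and mamba.py; the patch keeps the duplication, which leaves each warmup() self-contained.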