do not use tunableop for non-flash-causal-lm models
commit add4d42cb3
parent c2f4b7f93e
@@ -265,4 +265,10 @@ jobs:
 echo "DOCKER_VOLUME:"
 echo $DOCKER_VOLUME
 
+# TunableOp warmup is rather slow, do it only for a few seqlens.
+if [[ ${{ inputs.hardware }} == "rocm" ]]
+then
+  PYTORCH_TUNABLEOP_SEQLENS=2,4
+fi
+
 pytest -s -vvvvv integration-tests ${PYTEST_FLAGS}
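As an illustration of how PYTORCH_TUNABLEOP_SEQLENS could be consumed on the Python side, the sketch below parses the variable into a list of sequence lengths and runs one dummy GEMM per length so TunableOp has shapes to tune. This is a minimal sketch only; the helper names tunableop_seqlens and tunableop_warmup are hypothetical and not the actual text-generation-inference implementation.

# Hedged sketch: hypothetical consumer of PYTORCH_TUNABLEOP_SEQLENS.
import os
from typing import List

import torch


def tunableop_seqlens(default: str = "1,2,4,8,16,32") -> List[int]:
    # Parse the comma-separated override set by the CI step above,
    # falling back to a fuller sweep when the variable is unset.
    raw = os.environ.get("PYTORCH_TUNABLEOP_SEQLENS", default)
    return [int(s) for s in raw.split(",") if s.strip()]


def tunableop_warmup(linear: torch.nn.Linear) -> None:
    # One dummy forward per sequence length so TunableOp can benchmark
    # and cache the best GEMM algorithm for each shape.
    for seqlen in tunableop_seqlens():
        dummy = torch.zeros(
            seqlen,
            linear.in_features,
            device=linear.weight.device,
            dtype=linear.weight.dtype,
        )
        with torch.no_grad():
            linear(dummy)
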
@@ -466,6 +466,16 @@ class Mamba(Model):
         return MambaBatch
 
     def warmup(self, batch) -> Optional[int]:
+        if SYSTEM == "rocm" and (
+            os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None
+            or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1"
+        ):
+            logger.info(
+                f"ROCm: Got PYTORCH_TUNABLEOP_ENABLED=1 but TunableOp is not supported for {self.model_id} (instance of {self.__class__.__name__}). Disabling TunableOp."
+            )
+            torch.cuda.tunable.tuning_enable(False)
+            torch.cuda.tunable.enable(False)
+
         # TODO: implement warmup for Mamba if needed
         if CUDA_GRAPHS:
             if self.speculate is None or self.speculate == 0:
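Standalone, the guard added above reduces to the snippet below: a minimal sketch assuming PyTorch >= 2.3, where torch.cuda.tunable exposes enable/tuning_enable toggles and the matching is_enabled/tuning_is_enabled queries. The helper name maybe_disable_tunableop is introduced only for this sketch.

# Minimal sketch of the guard: turn TunableOp off when the environment
# leaves it at its ROCm default (unset) or explicitly requests it ("1")
# but the model class does not support it.
import os

import torch


def maybe_disable_tunableop() -> None:
    value = os.environ.get("PYTORCH_TUNABLEOP_ENABLED")
    if value is None or value == "1":
        # Stop tuning new shapes and disable the tunable-op path entirely.
        torch.cuda.tunable.tuning_enable(False)
        torch.cuda.tunable.enable(False)


if __name__ == "__main__":
    maybe_disable_tunableop()
    print("TunableOp enabled:", torch.cuda.tunable.is_enabled())
    print("Tuning enabled:", torch.cuda.tunable.tuning_is_enabled())
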
@@ -15,6 +15,9 @@ from text_generation_server.utils.adapter import (
     AdapterParameters,
     AdapterSource,
 )
+from text_generation_server.utils.import_utils import SYSTEM
+from text_generation_server.models.globals import CUDA_GRAPHS
+import os
 from loguru import logger
 
 
@@ -100,7 +103,23 @@ class Model(ABC):
         raise NotImplementedError
 
     def warmup(self, batch: B) -> Optional[int]:
+        if SYSTEM == "rocm" and (
+            os.environ.get("PYTORCH_TUNABLEOP_ENABLED") is None
+            or os.environ.get("PYTORCH_TUNABLEOP_ENABLED") == "1"
+        ):
+            logger.info(
+                f"ROCm: Got PYTORCH_TUNABLEOP_ENABLED=1 but TunableOp is not supported for {self.model_id} (instance of {self.__class__.__name__}). Disabling TunableOp."
+            )
+            torch.cuda.tunable.tuning_enable(False)
+            torch.cuda.tunable.enable(False)
+
         self.generate_token(batch)
+
+        if CUDA_GRAPHS:
+            logger.info(
+                f"Got CUDA_GRAPHS={CUDA_GRAPHS} but cuda graphs are not supported for {self.model_id} (instance of {self.__class__.__name__}). Cuda graphs will not be used."
+            )
+
         return None
 
     def decode_token(
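The CUDA_GRAPHS value logged above is a module-level global imported from text_generation_server.models.globals. As a rough illustration of what such a flag can look like, the sketch below derives a list of batch sizes from a CUDA_GRAPHS environment variable; the real parsing in globals.py may differ, and parse_cuda_graphs is a hypothetical helper used only here.

# Hedged sketch only: one plausible way a CUDA_GRAPHS global could be
# built from an environment variable.
import os
from typing import List, Optional


def parse_cuda_graphs(raw: Optional[str]) -> List[int]:
    # Unset, empty, or "0" means "do not capture cuda graphs"; otherwise
    # treat the value as a comma-separated list of batch sizes.
    if not raw or raw.strip() == "0":
        return []
    return sorted({int(x) for x in raw.split(",") if x.strip()})


CUDA_GRAPHS = parse_cuda_graphs(os.environ.get("CUDA_GRAPHS"))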