Fixing ROCm. (#2021)

# What does this PR do?

Fixes # (issue)

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.
2024-06-05 14:41:34 +02:00 · 2024-06-05 14:41:34 +02:00 · 0a94fad79f
parent 8aece3bd68
commit 0a94fad79f
1 changed files with 26 additions and 95 deletions
--- a/server/text_generation_server/layers/attention/rocm.py
+++ b/server/text_generation_server/layers/attention/rocm.py
@ -126,40 +126,34 @@ if ENGINE != "triton":
        import flash_attn_2_cuda
        logger.info("ROCm: using Flash Attention 2 Composable Kernel implementation.")
-    except ImportError:
+    except ImportError as e:
-        try:
+        if major >= 8:
-            import flash_attn_cuda
+            architecture_suffix = f"-{SYSTEM}"
            raise ImportError(
                "Flash Attention V2 is not installed.\n"
                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
                f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
            )
        elif is_sm75:
            raise ImportError(
                "Flash Attention is not installed.\n"
                "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
                "or install flash attention with `cd server && make install install-flash-attention`"
            ) from e
        else:
-            ENGINE = "v1"
+            for idx in range(torch.cuda.device_count()):
-            logger.info("ROCm: using Flash Attention 1")
+                name = torch.cuda.get_device_name(idx)
-        except ImportError as e:
+                if "MI210" not in name and "MI250" not in name:
-            if major >= 8:
+                    raise ImportError(
-                architecture_suffix = f"-{SYSTEM}"
+                        f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
-                raise ImportError(
+                    )
-                    "Flash Attention V2 is not installed.\n"
+            raise ImportError(
-                    "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
+                f"AMD GPU with ROCm capability {major} {minor} is not supported"
-                    f"or install flash attention v2 with `cd server && make install install-flash-attention-v2{architecture_suffix}`"
+            ) from e
                )
            elif is_sm75:
                raise ImportError(
                    "Flash Attention is not installed.\n"
                    "Use the official Docker image (ghcr.io/huggingface/text-generation-inference:latest) "
                    "or install flash attention with `cd server && make install install-flash-attention`"
                ) from e
            else:
                for idx in range(torch.cuda.device_count()):
                    name = torch.cuda.get_device_name(idx)
                    if "MI210" not in name and "MI250" not in name:
                        raise ImportError(
                            f"AMD GPU {torch.cuda.get_device_name(idx)} does not support flash-attention"
                        )
                raise ImportError(
                    f"AMD GPU with ROCm capability {major} {minor} is not supported"
                ) from e
-SUPPORTS_WINDOWING = ENGINE != "v1"
+SUPPORTS_WINDOWING = False
 if ENGINE == "ck":
    def attention(
@ -186,17 +180,12 @@ if ENGINE == "ck":
            out,
            cu_seqlens,
            cu_seqlens,
            None,
            None,
            None,
            max_s,
            max_s,
            0.0,
            softmax_scale,
            False,
            causal,
            window_size_left,
            0,
            False,
            None,
        )
@ -234,62 +223,4 @@ elif ENGINE == "triton":
        return output
 else:
-
+    raise RuntimeError(f"Unknown attention engine {ENGINE}")
    def attention(
        q,
        k,
        v,
        out,
        cu_seqlens,
        max_s,
        softmax_scale,
        window_size_left=-1,
    ):
        if window_size_left != -1:
            raise NotImplementedError(
                "window_size_left is only available with flash attn v2"
            )
        # Flash attention v1 requires q, k and v to have the same number of heads
        if k.shape[1] != q.shape[1]:
            # MQA expand
            if k.shape[1] == 1:
                k = k.expand(-1, q.shape[1], -1)
            # Grouped attention reshape
            else:
                original_shape = k.shape
                k = (
                    k.unsqueeze(2)
                    .expand(-1, -1, q.shape[1] // k.shape[1], -1)
                    .reshape(original_shape[0], -1, original_shape[2])
                )
        if v.shape[1] != q.shape[1]:
            # MQA expand
            if v.shape[1] == 1:
                v = v.expand(-1, q.shape[1], -1)
            # Grouped attention reshape
            else:
                original_shape = v.shape
                v = (
                    v.unsqueeze(2)
                    .expand(-1, -1, q.shape[1] // v.shape[1], -1)
                    .reshape(original_shape[0], -1, original_shape[2])
                )
        return flash_attn_cuda.fwd(
            q,
            k,
            v,
            out,
            cu_seqlens,
            cu_seqlens,
            max_s,
            max_s,
            0.0,
            softmax_scale,
            False,
            True,
            False,
            0,
            None,
        )