Break cycle between the attention implementations and KV cache (#2627)
Parent: 5f32dea1e2
Commit: 8ec57558cd
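Judging by their context lines, the hunks below touch the attention package's __init__, the CUDA, ROCm and IPEX backend modules, and the shared KV-cache module. Each backend previously defined its own reshape_and_cache, and the KV cache imported that helper back from the attention package; that is the cycle the title refers to. The change drops the per-backend helpers and gives the KV-cache module a single paged_reshape_and_cache that defers every backend import to call time. A minimal, self-contained sketch of that pattern follows (not the repository's code; the function name, the system argument and the "cpu" fallback are illustrative additions so the sketch runs anywhere):

import torch


def store_paged(key, value, key_cache, value_cache, slots, system):
    """Write new key/value vectors into a paged cache for the given system."""
    if system == "cuda":
        # Imported inside the branch: the cache module has no backend dependency
        # at import time, so there is no cycle with the attention backends.
        from vllm._C import cache_ops  # assumes vLLM is installed on CUDA systems

        cache_ops.reshape_and_cache(
            key, value, key_cache, value_cache, slots, "auto", 1.0
        )
    elif system == "cpu":
        # Plain tensor scatter; included only so the sketch is runnable anywhere.
        shape = key_cache.shape
        key_cache.view(-1, shape[-2], shape[-1])[slots] = key
        value_cache.view(-1, shape[-2], shape[-1])[slots] = value
    else:
        raise NotImplementedError(f"system '{system}' not supported in this sketch")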
@@ -11,21 +11,18 @@ if SYSTEM == "cuda":
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
-        reshape_and_cache,
     )
 elif SYSTEM == "rocm":
     from .rocm import (
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
-        reshape_and_cache,
     )
 elif SYSTEM == "ipex":
     from .ipex import (
         SUPPORTS_WINDOWING,
         attention,
         paged_attention,
-        reshape_and_cache,
     )
 else:
     raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention")
@@ -36,7 +33,6 @@ from .kv_cache import KVCache
 __all__ = [
     "attention",
     "paged_attention",
-    "reshape_and_cache",
     "SUPPORTS_WINDOWING",
     "KVCache",
     "Seqlen",
@@ -12,30 +12,6 @@ major, minor = torch.cuda.get_device_capability()
 is_sm75 = major == 7 and minor == 5
 _PARTITION_SIZE = 512
 
-try:
-    from vllm._C import cache_ops
-except Exception as e:
-    raise ImportError(
-        f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
-    )
-
-
-def reshape_and_cache(
-    key: torch.Tensor,
-    value: torch.Tensor,
-    key_cache: torch.Tensor,
-    value_cache: torch.Tensor,
-    slots: torch.Tensor,
-):
-    if ATTENTION in {"flashdecoding", "flashinfer"}:
-        shape = key_cache.shape
-        key_cache.view(-1, shape[-2], shape[-1])[slots] = key
-        value_cache.view(-1, shape[-2], shape[-1])[slots] = value
-    else:
-        cache_ops.reshape_and_cache(
-            key, value, key_cache, value_cache, slots, "auto", 1.0
-        )
-
 
 def paged_attention(
     query: torch.Tensor,
@@ -346,5 +322,4 @@ __all__ = [
     "SUPPORTS_WINDOWING",
     "attention",
     "paged_attention",
-    "reshape_and_cache",
 ]
@@ -47,18 +47,6 @@ def attention(
     return out
 
 
-def reshape_and_cache(
-    key: torch.Tensor,
-    value: torch.Tensor,
-    key_cache: torch.Tensor,
-    value_cache: torch.Tensor,
-    slots: torch.Tensor,
-):
-    ipex.llm.modules.PagedAttention.reshape_and_cache(
-        key, value, key_cache, value_cache, slots
-    )
-
-
 def paged_attention(
     query: torch.Tensor,
     kv_cache: KVCache,
@@ -94,5 +82,4 @@ __all__ = [
     "SUPPORTS_WINDOWING",
     "attention",
     "paged_attention",
-    "reshape_and_cache",
 ]
@@ -115,6 +115,41 @@ class KVCache:
             key_cache.view(-1, shape[-2], shape[-1])[slots] = key
             value_cache.view(-1, shape[-2], shape[-1])[slots] = value
         else:
-            from text_generation_server.layers.attention import reshape_and_cache
-
-            reshape_and_cache(key, value, key_cache, value_cache, slots)
+            paged_reshape_and_cache(key, value, key_cache, value_cache, slots)
+
+
+def paged_reshape_and_cache(
+    key: torch.Tensor,
+    value: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    slots: torch.Tensor,
+):
+    if SYSTEM == "cuda":
+        try:
+            from vllm._C import cache_ops
+        except Exception as e:
+            raise ImportError(
+                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
+            )
+        cache_ops.reshape_and_cache(
+            key, value, key_cache, value_cache, slots, "auto", 1.0
+        )
+    elif SYSTEM == "rocm":
+        try:
+            import vllm._custom_ops as ops
+        except Exception as e:
+            raise ImportError(
+                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
+            )
+        ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0)
+    elif SYSTEM == "ipex":
+        import intel_extension_for_pytorch as ipex
+
+        ipex.llm.modules.PagedAttention.reshape_and_cache(
+            key, value, key_cache, value_cache, slots
+        )
+    else:
+        raise NotImplementedError(
+            f"Cannot reshape and cache for paged attention, system '{SYSTEM}' not supported"
+        )
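For the flashdecoding/flashinfer layout, the store shown in the KVCache hunk above (and previously duplicated in each backend) is plain tensor indexing: the block and block-position dimensions are flattened into a single slot dimension and the new vectors are scattered by slot index. A small CPU demonstration of exactly that indexing (the cache shape here is an illustrative assumption, not the layout used in the repository):

import torch

# Illustrative shapes: 4 blocks of 16 slots, 2 KV heads, head size 8.
num_blocks, block_size, num_heads, head_size = 4, 16, 2, 8
key_cache = torch.zeros(num_blocks, block_size, num_heads, head_size)
value_cache = torch.zeros_like(key_cache)

# Three new tokens written at flat slot indices 0, 1 and 17
# (slot 17 = block 1, position 1).
key = torch.randn(3, num_heads, head_size)
value = torch.randn(3, num_heads, head_size)
slots = torch.tensor([0, 1, 17])

# Same indexing as the flashdecoding/flashinfer branch above:
# flatten (num_blocks, block_size) into one slot dimension, then scatter by slot.
shape = key_cache.shape
key_cache.view(-1, shape[-2], shape[-1])[slots] = key
value_cache.view(-1, shape[-2], shape[-1])[slots] = value

assert torch.equal(key_cache[1, 1], key[2])  # block 1, position 1 holds token 2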
@@ -3,7 +3,6 @@ from typing import Optional
 import torch
 from text_generation_server.layers.attention.kv_cache import KVCache
 from text_generation_server.utils.import_utils import SYSTEM
-from text_generation_server.models.globals import ATTENTION
 from text_generation_server.layers.attention import Seqlen
 from text_generation_server.utils.log import log_master
 from loguru import logger
@@ -28,28 +27,6 @@ except ImportError as e:
     )
     use_rocm_custom_paged_attn = False
 
-try:
-    import vllm._custom_ops as ops
-except Exception as e:
-    raise ImportError(
-        f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
-    )
-
-
-def reshape_and_cache(
-    key: torch.Tensor,
-    value: torch.Tensor,
-    key_cache: torch.Tensor,
-    value_cache: torch.Tensor,
-    slots: torch.Tensor,
-):
-    if ATTENTION == "flashdecoding":
-        shape = key_cache.shape
-        key_cache.view(-1, shape[-2], shape[-1])[slots] = key
-        value_cache.view(-1, shape[-2], shape[-1])[slots] = value
-    else:
-        ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0)
-
 
 def paged_attention(
     query: torch.Tensor,
@@ -305,5 +282,4 @@ __all__ = [
     "SUPPORTS_WINDOWING",
     "attention",
     "paged_attention",
-    "reshape_and_cache",
 ]
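Note for downstream code: anything that previously did from text_generation_server.layers.attention import reshape_and_cache will no longer find that name, since the helper is removed from every backend and from each module's __all__. Judging by the KVCache hunk, cache writes are now expected to go through the KVCache object, with paged_reshape_and_cache in the kv_cache module serving as the low-level, per-system helper.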