Flashinfer test.

2024-05-24 15:32:24 +00:00 · 2024-05-24 15:32:24 +00:00 · 3c74cf9cd4
parent 01e4442ef6
commit 3c74cf9cd4
4 changed files with 98 additions and 39 deletions
--- a/server/text_generation_server/models/cache_manager.py
+++ b/server/text_generation_server/models/cache_manager.py
@@ -5,7 +5,8 @@ from typing import Optional, List, Tuple
 from text_generation_server.utils.import_utils import SYSTEM
 from text_generation_server.models.globals import FLASH_DECODING

-BLOCK_SIZE: int = 256 if FLASH_DECODING else 16
+# BLOCK_SIZE: int = 256 if FLASH_DECODING else 16
+BLOCK_SIZE: int = 16
 # Will be set in warmup
 CACHE_MANAGER: Optional["CacheManager"] = None

@@ -33,18 +34,21 @@ class CacheManager:

        if FLASH_DECODING:
            self.kv_cache = [
-                (
                torch.empty(
-                        (num_blocks, self.block_size, num_heads, head_size),
+                    (num_blocks, 2, self.block_size, num_heads, head_size),
                    dtype=dtype,
                    device=device,
-                    ),
-                    torch.empty(
-                        (num_blocks, self.block_size, num_heads, head_size),
-                        dtype=dtype,
-                        device=device,
-                    ),
                )
+                # torch.empty(
+                #     (num_blocks, self.block_size, num_heads, head_size),
+                #     dtype=dtype,
+                #     device=device,
+                # ),
+                # torch.empty(
+                #     (num_blocks, self.block_size, num_heads, head_size),
+                #     dtype=dtype,
+                #     device=device,
+                # ),
                for _ in range(num_layers)
            ]
        else:
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -137,6 +137,7 @@ class FlashLlamaAttention(torch.nn.Module):
        slots,
        input_lengths,
        max_s,
+        prefill_wrapper,
    ):
        qkv = self.query_key_value(hidden_states)
        query, kv = qkv.split(
@@ -152,37 +153,40 @@ class FlashLlamaAttention(torch.nn.Module):
        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

        paged_attention.reshape_and_cache(
-            kv[:, 0], kv[:, 1], kv_cache[0], kv_cache[1], slots
+            kv[:, 0], kv[:, 1], kv_cache[:, 0], kv_cache[:, 1], slots
        )

        # output tensor
        attn_output = torch.empty_like(query)

-        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
-            flash_attn.attention(
-                query,
-                torch.select(kv, dim=1, index=0),
-                torch.select(kv, dim=1, index=1),
-                attn_output,
-                cu_seqlen_prefill,
-                max_s,
-                self.softmax_scale,
+            attn_output = prefill_wrapper.forward(
+                query.contiguous(), kv[:, 0].contiguous(), kv[:, 1].contiguous()
            )
+            # flash_attn.attention(
+            #     query,
+            #     torch.select(kv, dim=1, index=0),
+            #     torch.select(kv, dim=1, index=1),
+            #     attn_output,
+            #     cu_seqlen_prefill,
+            #     max_s,
+            #     self.softmax_scale,
+            # )
        # Decode
        else:
-            paged_attention.attention(
-                attn_output,
-                query,
-                kv_cache[0],
-                kv_cache[1],
-                self.kv_head_mapping,
-                self.softmax_scale,
-                block_tables,
-                input_lengths,
-                max_s,
-            )
+            attn_output = prefill_wrapper.forward(query, kv_cache)
+            # paged_attention.attention(
+            #     attn_output,
+            #     query,
+            #     kv_cache[0],
+            #     kv_cache[1],
+            #     self.kv_head_mapping,
+            #     self.softmax_scale,
+            #     block_tables,
+            #     input_lengths,
+            #     max_s,
+            # )

        return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size))

@@ -283,6 +287,7 @@ class FlashLlamaLayer(nn.Module):
        slots,
        input_lengths,
        max_s,
+        prefill_wrapper,
    ):
        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)

@@ -297,6 +302,7 @@ class FlashLlamaLayer(nn.Module):
            slots,
            input_lengths,
            max_s,
+            prefill_wrapper,
        )

        # faster post attention rms norm
@@ -362,6 +368,54 @@ class FlashLlamaModel(torch.nn.Module):
        cos, sin = self.layers[0].self_attn.rotary_emb.get_cos_sin(
            position_ids, max_s, hidden_states.dtype
        )
+        workspace_buffer = torch.empty(
+            16 * 1024 * 1024, dtype=torch.uint8, device=inputs_embeds.device
+        )
+        import flashinfer
+
+        if cu_seqlen_prefill is None:
+            prefill_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
+                workspace_buffer, "NHD"
+            )
+            cu_seqlen_q = torch.arange(
+                input_lengths.shape[0] + 1,
+                device=inputs_embeds.device,
+                dtype=torch.int32,
+            )
+            cu_seqlen_k = torch.cat(
+                [
+                    torch.zeros(
+                        (1,), device=input_lengths.device, dtype=input_lengths.dtype
+                    ),
+                    input_lengths.cumsum(dim=-1),
+                ]
+            ).to(dtype=torch.int32)
+
+            prefill_wrapper.begin_forward(
+                indptr=cu_seqlen_k,
+                indices=block_tables.view(-1),
+                last_page_len=slots.to(dtype=torch.int32),
+                num_qo_heads=self.layers[0].self_attn.num_heads,
+                num_kv_heads=self.layers[0].self_attn.num_key_value_heads,
+                head_dim=self.layers[0].self_attn.head_size,
+                page_size=16,
+                pos_encoding_mode="NONE",
+                data_type=inputs_embeds.dtype,
+            )
+        else:
+            prefill_wrapper = flashinfer.BatchPrefillWithRaggedKVCacheWrapper(
+                workspace_buffer, "NHD"
+            )
+            cu_seqlen_q = cu_seqlen_prefill
+            cu_seqlen_k = cu_seqlen_prefill
+
+            prefill_wrapper.begin_forward(
+                qo_indptr=cu_seqlen_q,
+                kv_indptr=cu_seqlen_k,
+                num_qo_heads=self.layers[0].self_attn.num_heads,
+                num_kv_heads=self.layers[0].self_attn.num_key_value_heads,
+                head_dim=self.layers[0].self_attn.head_size,
+            )

        residual = None
        for i, layer in enumerate(self.layers):
@@ -376,8 +430,11 @@ class FlashLlamaModel(torch.nn.Module):
                slots,
                input_lengths,
                max_s,
+                prefill_wrapper,
            )

+        prefill_wrapper.end_forward()
+
        hidden_states, _ = self.norm(hidden_states, residual)

        return hidden_states
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -214,6 +214,8 @@ class MistralAttention(torch.nn.Module):

        self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin)

+        attn_output = torch.empty_like(query)
+
        if prefill_cache_indices is not None:
            kv_to_cache = kv[prefill_cache_indices]
        else:
@@ -222,10 +224,6 @@ class MistralAttention(torch.nn.Module):
        paged_attention.reshape_and_cache(
            kv_to_cache[:, 0], kv_to_cache[:, 1], kv_cache[0], kv_cache[1], slots
        )
-
-        # output tensor
-        attn_output = torch.empty_like(query)
-
        # Prefill
        if cu_seqlen_prefill is not None:
            # flash attention
--- a/server/text_generation_server/utils/paged_attention.py
+++ b/server/text_generation_server/utils/paged_attention.py
@@ -30,8 +30,8 @@ def reshape_and_cache(
    else:
        if FLASH_DECODING:
            shape = key_cache.shape
-            key_cache.view(-1, shape[-2], shape[-1])[slots] = key
-            value_cache.view(-1, shape[-2], shape[-1])[slots] = value
+            # key_cache.view(-1, shape[-2], shape[-1])[slots] = key
+            # value_cache.view(-1, shape[-2], shape[-1])[slots] = value
        else:
            cache_ops.reshape_and_cache(
                key, value, key_cache, value_cache, slots, "auto", 1.0