Fixup residual, initial block attention config

Daniël de Kok 2024-06-13 10:38:56 +02:00
parent 4ed551abba
commit 5d2b93ba42
1 changed file with 43 additions and 13 deletions


@@ -19,6 +19,7 @@
 # limitations under the License.
 from typing import List, Optional, Tuple
+from dataclasses import dataclass

 import torch
 import torch.distributed
@@ -50,6 +51,14 @@ if SYSTEM == "rocm":
         raise ImportError(f"Could not load `vllm._custom_C`. Full error: {e}")


+@dataclass
+class BlockSparseAttentionConfig:
+    block_size: int
+    homo_head_pattern: bool
+    num_local_blocks: int
+    vert_stride: int
+
+
 def load_attention(config, prefix, weights):
     # Only defined in granite.
     bias = getattr(config, "attention_bias", False)
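Note: the fields above mirror the blocksparse_* attributes of the Phi-3-small configuration. As one illustration of what they encode, a sketch assuming the usual local-plus-strided interpretation (each query block sees the most recent num_local_blocks blocks plus every vert_stride-th earlier block); this is not code from the commit, and the per-head variation used when homo_head_pattern is False is not modelled:

from dataclasses import dataclass

import torch


@dataclass
class BlockSparseAttentionConfig:
    block_size: int
    homo_head_pattern: bool
    num_local_blocks: int
    vert_stride: int


def block_visibility(config: BlockSparseAttentionConfig, seqlen: int) -> torch.Tensor:
    """Boolean (num_blocks, num_blocks) mask: which key blocks each query block sees."""
    num_blocks = (seqlen + config.block_size - 1) // config.block_size
    q = torch.arange(num_blocks)[:, None]  # query block index
    k = torch.arange(num_blocks)[None, :]  # key block index
    causal = k <= q
    local = (q - k) < config.num_local_blocks
    # Every vert_stride-th key block stays visible to all later query blocks.
    strided = (k + 1) % config.vert_stride == 0
    return causal & (local | strided)


cfg = BlockSparseAttentionConfig(
    block_size=64, homo_head_pattern=True, num_local_blocks=16, vert_stride=8
)
print(block_visibility(cfg, seqlen=4096).shape)  # torch.Size([64, 64])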
@@ -69,6 +78,7 @@ class FlashPhi3SmallAttention(torch.nn.Module):
         self,
         prefix: str,
         config,
+        layer_id: int,
         weights,
     ):
         super().__init__()
@@ -83,6 +93,9 @@ class FlashPhi3SmallAttention(torch.nn.Module):
             device=weights.device,
         )

-        self.softmax_scale = self.head_size**-0.5
+        if hasattr(config, "mup_use_scaling") and config.mup_use_scaling:
+            self.softmax_scale = self.head_size / config.mup_attn_multiplier
+        else:
+            self.softmax_scale = self.head_size**-0.5

         if self.num_heads % weights.process_group.size() != 0:
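For reference, with illustrative numbers (a head size of 128 and mup_attn_multiplier of 1.0, neither taken from a real checkpoint), the two branches above evaluate to:

head_size = 128

# Default scaled-dot-product attention scaling.
default_scale = head_size**-0.5  # ~0.0884

# muP-style value selected when config.mup_use_scaling is set,
# assuming an illustrative mup_attn_multiplier of 1.0.
mup_attn_multiplier = 1.0
mup_scale = head_size / mup_attn_multiplier  # 128.0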
@@ -102,11 +115,25 @@ class FlashPhi3SmallAttention(torch.nn.Module):
         self.query_key_value = load_attention(config, prefix, weights)

+        is_dense = getattr(config, "dense_attention_every_n_layers", False) and (
+            (layer_id + 1) % config.dense_attention_every_n_layers == 0
+        )
+        if is_dense:
+            self.blocksparse_config = None
+        else:
+            self.blocksparse_config = BlockSparseAttentionConfig(
+                block_size=config.blocksparse_block_size,
+                homo_head_pattern=config.blocksparse_homo_head_pattern,
+                num_local_blocks=config.blocksparse_num_local_blocks,
+                vert_stride=config.blocksparse_vert_stride,
+            )
+
         self.o_proj = TensorParallelRowLinear.load(
             config,
             prefix=f"{prefix}.dense",
             weights=weights,
-            bias=False,
+            bias=True,
         )

         self.num_groups = self.num_heads // self.num_key_value_heads
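With a hypothetical dense_attention_every_n_layers = 2, the is_dense test above makes every second layer use dense attention and leaves the rest block-sparse:

dense_attention_every_n_layers = 2  # hypothetical config value
num_hidden_layers = 8

for layer_id in range(num_hidden_layers):
    is_dense = dense_attention_every_n_layers and (
        (layer_id + 1) % dense_attention_every_n_layers == 0
    )
    print(layer_id, "dense" if is_dense else "blocksparse")
# 0 blocksparse, 1 dense, 2 blocksparse, 3 dense, ...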
@@ -246,10 +273,13 @@ class Phi3SmallMLP(nn.Module):
 class FlashPhi3SmallLayer(nn.Module):
-    def __init__(self, prefix, config, weights):
+    def __init__(self, prefix, config, layer_id: int, weights):
         super().__init__()
         self.self_attn = FlashPhi3SmallAttention(
-            prefix=f"{prefix}.self_attn", config=config, weights=weights
+            prefix=f"{prefix}.self_attn",
+            config=config,
+            layer_id=layer_id,
+            weights=weights,
         )
         self.mlp = Phi3SmallMLP(prefix=f"{prefix}.mlp", config=config, weights=weights)
@@ -267,7 +297,6 @@ class FlashPhi3SmallLayer(nn.Module):
     def forward(
         self,
         hidden_states,
-        residual,
         cos,
         sin,
         cu_seqlen_prefill,
@@ -277,7 +306,8 @@ class FlashPhi3SmallLayer(nn.Module):
         input_lengths,
         max_s,
     ):
-        normed_hidden_states, res = self.input_layernorm(hidden_states, residual)
+        residual = hidden_states
+        normed_hidden_states, res = self.input_layernorm(hidden_states, None)

         # Self Attention
         attn_output = self.self_attn(
@@ -294,12 +324,13 @@ class FlashPhi3SmallLayer(nn.Module):
         # faster post attention rms norm
         normed_attn_res_output, attn_res = self.post_attention_layernorm(
-            attn_output, res
+            attn_output, residual
         )

         mlp_output = self.mlp(normed_attn_res_output)
+        mlp_output = attn_res + mlp_output

-        return mlp_output, attn_res
+        return mlp_output


 class FlashPhi3SmallModel(torch.nn.Module):
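Taken together, the forward changes in this layer capture the residual once from the incoming hidden states and return a single tensor instead of a (hidden_states, residual) pair. A stripped-down sketch of the resulting data flow, with plain nn.LayerNorm and nn.Linear standing in for the fused norms, attention, and MLP of the real layer (in TGI the fused norm itself returns both the normed value and the summed residual; here the sum is explicit):

import torch
from torch import nn


class ResidualSketch(nn.Module):
    """Simplified data flow of FlashPhi3SmallLayer.forward after this commit."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.input_layernorm = nn.LayerNorm(hidden_size)
        self.post_attention_layernorm = nn.LayerNorm(hidden_size)
        # Stand-ins for the real attention and MLP blocks.
        self.self_attn = nn.Linear(hidden_size, hidden_size)
        self.mlp = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        residual = hidden_states
        attn_output = self.self_attn(self.input_layernorm(hidden_states))
        # Residual added back after attention, then normed for the MLP.
        attn_res = attn_output + residual
        mlp_output = self.mlp(self.post_attention_layernorm(attn_res))
        return attn_res + mlp_output


layer = ResidualSketch(16)
print(layer(torch.randn(2, 16)).shape)  # torch.Size([2, 16])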
@@ -318,12 +349,13 @@ class FlashPhi3SmallModel(torch.nn.Module):
                        else f"{prefix}.model.layers.{layer_id}"
                    ),
                    config=config,
+                    layer_id=layer_id,
                    weights=weights,
                )
                for layer_id in range(config.num_hidden_layers)
            ]
        )
-        self.norm = FastLayerNorm.load(
+        self.norm = nn.LayerNorm.load(
            prefix=(
                "model.final_layernorm"
                if not prefix
@@ -360,11 +392,9 @@ class FlashPhi3SmallModel(torch.nn.Module):
            position_ids, max_s, hidden_states.dtype
        )

-        residual = None
        for i, layer in enumerate(self.layers):
-            hidden_states, residual = layer(
+            hidden_states = layer(
                hidden_states,
-                residual,
                cos,
                sin,
                cu_seqlen_prefill,
@@ -375,7 +405,7 @@ class FlashPhi3SmallModel(torch.nn.Module):
                max_s,
            )

-        hidden_states, _ = self.norm(hidden_states, residual)
+        hidden_states = self.norm(hidden_states)

        return hidden_states
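At the model level the same contract change shows up in the last two hunks: each layer now takes and returns a single hidden-states tensor, and the final norm no longer receives a residual. Roughly, with stand-in modules rather than the real layers:

import torch
from torch import nn

# Stand-in layers that, like the updated FlashPhi3SmallLayer, take and return
# a single hidden-states tensor instead of a (hidden_states, residual) pair.
layers = nn.ModuleList(nn.Linear(16, 16) for _ in range(4))
norm = nn.LayerNorm(16)

hidden_states = torch.randn(2, 16)
for layer in layers:
    hidden_states = layer(hidden_states)
hidden_states = norm(hidden_states)
print(hidden_states.shape)  # torch.Size([2, 16])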