feat(server): reduce mlp and attn in one op for flash neox (#145)
parent f000068944 · commit c9bdaa8b73
@@ -1,3 +1,23 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 import torch.distributed
 
@@ -16,6 +36,42 @@ import dropout_layer_norm
 from flash_attn.layers.rotary import RotaryEmbedding
 
 
+class FastLayerNorm(nn.LayerNorm):
+    def forward(self, hidden_states, residual=None):
+        if hidden_states.shape[-1] > 6144:
+            if residual is not None:
+                hidden_states += residual
+            residual = hidden_states
+
+            return super(FastLayerNorm, self).forward(hidden_states), residual
+        else:
+            (
+                normed_hidden_states,
+                residual,
+                *rest,
+            ) = dropout_layer_norm.dropout_add_ln_fwd(
+                hidden_states,
+                residual,
+                self.weight,
+                self.bias,
+                None,
+                None,
+                None,
+                None,
+                0.0,
+                self.eps,
+                1.0,
+                0,
+                None,
+                False,
+                False,
+            )
+            if residual is None:
+                residual = hidden_states
+
+            return normed_hidden_states, residual
+
+
 class FastLinear(nn.Linear):
     def __init__(
         self,
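For readers without the `dropout_layer_norm` CUDA extension, here is a minimal pure-PyTorch sketch of the contract the new `FastLayerNorm` exposes (the name `ReferenceAddLayerNorm` is illustrative; it mirrors the `> 6144` fallback branch above, not the fused kernel):

```python
import torch
from torch import nn


class ReferenceAddLayerNorm(nn.LayerNorm):
    # Optionally add the residual, keep the pre-norm sum as the new residual,
    # and return (normed_hidden_states, residual) -- the same contract as
    # FastLayerNorm, without the fused dropout_add_ln_fwd kernel.
    def forward(self, hidden_states, residual=None):
        if residual is not None:
            hidden_states = hidden_states + residual
        residual = hidden_states
        return super().forward(hidden_states), residual


norm = ReferenceAddLayerNorm(1024, eps=1e-5)
normed, residual = norm(torch.randn(2, 1024), residual=torch.randn(2, 1024))
```

Returning the pre-norm sum lets the next call site fuse its own residual addition into the same kernel instead of materialising it separately.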
@@ -59,9 +115,6 @@ class TensorParallelColumnLinear(FastLinear):
             dtype=dtype,
         )
 
-    def forward(self, input):
-        return super(TensorParallelColumnLinear, self).forward(input)
-
 
 class TensorParallelRowLinear(FastLinear):
     def __init__(
@@ -69,12 +122,14 @@ class TensorParallelRowLinear(FastLinear):
         in_features,
         out_features,
         process_group: torch.distributed.ProcessGroup,
+        reduce=True,
         bias=True,
         device=None,
         dtype=None,
     ):
         self.process_group = process_group
         self.tp_world_size = process_group.size()
+        self.reduce = reduce
         assert in_features % self.tp_world_size == 0
         in_features = in_features // self.tp_world_size
 
@@ -88,7 +143,8 @@ class TensorParallelRowLinear(FastLinear):
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         out = super(TensorParallelRowLinear, self).forward(input)
-        torch.distributed.all_reduce(out, group=self.process_group)
+        if self.reduce:
+            torch.distributed.all_reduce(out, group=self.process_group)
 
         return out
 
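The new `reduce` flag lets a caller skip the per-projection all-reduce and perform it later. A small sanity check of why deferring is safe (a hedged sketch, not part of the repo: it assumes a process group launched with e.g. `torchrun --nproc_per_node=2` and uses the `gloo` backend for simplicity): all-reduce with SUM is an elementwise sum over ranks, so adding the rank-local partials first and reducing once gives the same result as reducing each partial output separately.

```python
import torch
import torch.distributed as dist


def main():
    dist.init_process_group("gloo")
    torch.manual_seed(dist.get_rank())  # different partial outputs per rank
    attn_partial = torch.randn(4, 8)
    mlp_partial = torch.randn(4, 8)

    # Old path: one all-reduce per sub-module output.
    a, b = attn_partial.clone(), mlp_partial.clone()
    dist.all_reduce(a)
    dist.all_reduce(b)
    two_reduces = a + b

    # New path: add the rank-local partials first, reduce the sum once.
    fused = attn_partial + mlp_partial
    dist.all_reduce(fused)

    torch.testing.assert_close(two_reduces, fused)
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```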
@@ -196,7 +252,13 @@ class PositionRotaryEmbedding(RotaryEmbedding):
 
 class FlashNeoxAttention(torch.nn.Module):
     def __init__(
-        self, num_heads, hidden_size, rotary_pct, rotary_emb_base, process_group=None
+        self,
+        num_heads,
+        hidden_size,
+        rotary_pct,
+        rotary_emb_base,
+        process_group=None,
+        reduce=True,
     ):
         super().__init__()
         self.num_heads = num_heads
@@ -218,9 +280,7 @@ class FlashNeoxAttention(torch.nn.Module):
             process_group=process_group,
         )
         self.dense = TensorParallelRowLinear(
-            hidden_size,
-            hidden_size,
-            process_group=process_group,
+            hidden_size, hidden_size, process_group=process_group, reduce=reduce
         )
 
     def shuffle_qkv_dims(self):
@@ -309,7 +369,9 @@ class FlashNeoxAttention(torch.nn.Module):
 
 
 class FlashMLP(nn.Module):
-    def __init__(self, act, hidden_size, intermediate_size, process_group=None):
+    def __init__(
+        self, act, hidden_size, intermediate_size, process_group=None, reduce=True
+    ):
         super().__init__()
         self.act = (
             ACT2FN[act]
@@ -330,6 +392,7 @@ class FlashMLP(nn.Module):
             intermediate_size,
             hidden_size,
             process_group=process_group,
+            reduce=reduce,
         )
         self.process_group = process_group
 
@@ -355,12 +418,24 @@ class FlashNeoXLayer(nn.Module):
     ):
         super().__init__()
         self.use_parallel_residual = use_parallel_residual
-        self.input_layernorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
-        self.post_attention_layernorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+        self.input_layernorm = FastLayerNorm(hidden_size, eps=layer_norm_eps)
+        self.post_attention_layernorm = FastLayerNorm(hidden_size, eps=layer_norm_eps)
         self.attention = FlashNeoxAttention(
-            num_heads, hidden_size, rotary_pct, rotary_emb_base, process_group
+            num_heads,
+            hidden_size,
+            rotary_pct,
+            rotary_emb_base,
+            process_group,
+            reduce=not use_parallel_residual,
         )
-        self.mlp = FlashMLP(act, hidden_size, intermediate_size, process_group)
+        self.mlp = FlashMLP(
+            act,
+            hidden_size,
+            intermediate_size,
+            process_group,
+            reduce=not use_parallel_residual,
+        )
+        self.process_group = process_group
 
     def forward(
         self,
@@ -375,24 +450,7 @@ class FlashNeoXLayer(nn.Module):
         cu_seqlens_q,
     ):
         if self.use_parallel_residual:
-            # faster input layer norm
-            ln1_hidden_states, *rest = dropout_layer_norm.dropout_add_ln_fwd(
-                hidden_states,
-                None,
-                self.input_layernorm.weight,
-                self.input_layernorm.bias,
-                None,
-                None,
-                None,
-                None,
-                0.0,
-                self.input_layernorm.eps,
-                1.0,
-                0,
-                None,
-                False,
-                False,
-            )
+            ln1_hidden_states, _ = self.input_layernorm(hidden_states)
 
             attn_output = self.attention(
                 ln1_hidden_states,
@@ -405,46 +463,18 @@ class FlashNeoXLayer(nn.Module):
                 cu_seqlens_q,
             )
 
-            # faster post attention layer norm
-            ln2_hidden_states, *rest = dropout_layer_norm.dropout_add_ln_fwd(
-                hidden_states,
-                None,
-                self.post_attention_layernorm.weight,
-                self.post_attention_layernorm.bias,
-                None,
-                None,
-                None,
-                None,
-                0.0,
-                self.post_attention_layernorm.eps,
-                1.0,
-                0,
-                None,
-                False,
-                False,
-            )
+            ln2_hidden_states, _ = self.post_attention_layernorm(hidden_states)
 
             mlp_output = self.mlp(ln2_hidden_states)
-            return mlp_output + attn_output + hidden_states, None
+            intermediate = mlp_output + attn_output
+
+            # Only reduce once and after the addition instead of once per layer
+            if self.process_group is not None:
+                torch.distributed.all_reduce(intermediate, group=self.process_group)
+
+            return intermediate + hidden_states, None
         else:
-            # faster input layer norm
-            hidden_states, residual, *rest = dropout_layer_norm.dropout_add_ln_fwd(
-                hidden_states,
-                residual,
-                self.input_layernorm.weight,
-                self.input_layernorm.bias,
-                None,
-                None,
-                None,
-                None,
-                0.0,
-                self.input_layernorm.eps,
-                1.0,
-                0,
-                None,
-                False,
-                False,
-            )
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
 
             hidden_states = self.attention(
                 hidden_states,
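Condensed, the new parallel-residual path amounts to the sketch below (a simplification, not the repo's code: it omits the flash-attention arguments and KV-cache plumbing, and `ln1`, `ln2`, `attention`, `mlp` are hypothetical stand-ins whose output projections were built with `reduce=False`, so their outputs are rank-local partial sums):

```python
import torch.distributed as dist


def parallel_residual_forward(
    hidden_states, ln1, ln2, attention, mlp, process_group=None
):
    # Each fused layer norm returns (normed, residual); only normed is needed here.
    attn_output = attention(ln1(hidden_states)[0])
    mlp_output = mlp(ln2(hidden_states)[0])

    # attn_output and mlp_output are partial sums on each tensor-parallel rank:
    # add them locally, then issue a single all-reduce per layer instead of two.
    intermediate = mlp_output + attn_output
    if process_group is not None:
        dist.all_reduce(intermediate, group=process_group)

    return intermediate + hidden_states
```

In the non-parallel-residual branch the projections are built with `reduce=not use_parallel_residual`, i.e. the per-projection reduces stay enabled, so the reduction pattern there is unchanged.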
@@ -457,23 +487,8 @@ class FlashNeoXLayer(nn.Module):
                 cu_seqlens_q,
             )
 
-            # faster post attention layer norm
-            hidden_states, residual, *rest = dropout_layer_norm.dropout_add_ln_fwd(
-                hidden_states,
-                residual,
-                self.post_attention_layernorm.weight,
-                self.post_attention_layernorm.bias,
-                None,
-                None,
-                None,
-                None,
-                0.0,
-                self.post_attention_layernorm.eps,
-                1.0,
-                0,
-                None,
-                False,
-                False,
+            hidden_states, residual = self.post_attention_layernorm(
+                hidden_states, residual
             )
 
             mlp_output = self.mlp(hidden_states)
@@ -523,7 +538,7 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
                 for _ in range(config.num_hidden_layers)
             ]
         )
-        self.final_layer_norm = nn.LayerNorm(
+        self.final_layer_norm = FastLayerNorm(
             config.hidden_size, eps=config.layer_norm_eps
         )
 
@@ -603,24 +618,7 @@ class FlashGPTNeoXModel(FlashGPTNeoXPreTrainedModel):
                 cu_seqlens_q,
             )
 
-        # Faster final layer norm
-        hidden_states, *rest = dropout_layer_norm.dropout_add_ln_fwd(
-            hidden_states,
-            residual,
-            self.final_layer_norm.weight,
-            self.final_layer_norm.bias,
-            None,
-            None,
-            None,
-            None,
-            0.0,
-            self.final_layer_norm.eps,
-            1.0,
-            0,
-            None,
-            False,
-            False,
-        )
+        hidden_states, _ = self.final_layer_norm(hidden_states, residual)
 
         return hidden_states, past_key_values
 