Add sealion mpt support (#1477)

# What does this PR do?

Fixes # (issue)

## Before submitting

- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests), Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag members/contributors who may be interested in your PR.

---------

Co-authored-by: Choon Meng Tan <choonmeng@aisingapore.org>
Co-authored-by: David Ong Tat-Wee <13075447+ongtw@users.noreply.github.com>
2024-01-26 14:05:02 +01:00 · 2024-01-26 14:05:02 +01:00 · ac49972752
parent b95732180d
commit ac49972752
1 changed file with 63 additions and 18 deletions
--- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py
@ -28,7 +28,6 @@ EPS = 1e-5


 def load_col(config, prefix, weights, bias):
-    assert bias == False, NotImplementedError
    assert config.quantize != "gptq", NotImplementedError
    slice_ = weights._get_slice(f"{prefix}.weight")
    rank = weights.process_group.rank()
@ -45,7 +44,26 @@ def load_col(config, prefix, weights, bias):
    if weight.dtype != torch.int32:
        weight = weight.to(dtype=weights.dtype)
    weight = weight.to(device=weights.device)
-    bias = None
+
+    if bias:
+        bias_slice_ = weights._get_slice(f"{prefix}.bias")
+        bias_rank = weights.process_group.rank()
+        bias_size = weights.process_group.size()
+
+        bias_h = bias_slice_.get_shape()
+        bias_h = bias_h[0]
+        bias_block_size = bias_h // bias_size
+
+        bias_q_part = bias_slice_[bias_rank * bias_block_size : (bias_rank + 1) * bias_block_size]
+        bias_k_part = bias_slice_[bias_h + bias_rank * bias_block_size : bias_h + (bias_rank + 1) * bias_block_size]
+        bias_v_part = bias_slice_[2 * bias_h + bias_rank * bias_block_size : 2 * bias_h + (bias_rank + 1) * bias_block_size]
+
+        bias = torch.cat([bias_q_part, bias_k_part, bias_v_part], dim=0)
+        if bias.dtype != torch.int32:
+            bias = bias.to(dtype=weights.dtype)
+        bias = bias.to(device=weights.device)
+    else:
+        bias = None
    linear = get_linear(weight, bias, config.quantize)
    return TensorParallelColumnLinear(linear)

@ -330,7 +348,12 @@ class MultiheadAttention(nn.Module):
            config, prefix=f"{prefix}.Wqkv", weights=weights, bias=not config.no_bias
        )
        if self.qk_ln:
-            raise NotImplementedError("qk_ln is not supported")
+            bias = not config.no_bias
+            hidden_size = config.d_model
+            head_dim = hidden_size // self.n_heads
+
+            self.q_ln = LPLayerNorm(d_model, bias=bias, prefix=f"{prefix}.q_ln", weights=weights)
+            self.k_ln = LPLayerNorm(self.n_heads * head_dim, prefix=f"{prefix}.k_ln", weights=weights)
        if self.attn_impl == "flash":
            self.attn_fn = flash_attn_fn
        elif self.attn_impl == "triton":
@ -581,12 +604,20 @@ class MPTBlock(nn.Module):
                f"""Not implemented attn {config.attn_config["attn_type"]}"""
            )
        resid_pdrop = config.resid_pdrop
-        self.norm_1 = nn.LayerNorm.load_no_bias(
-            prefix=f"{prefix}.norm_1", weights=weights, eps=EPS
-        )
-        self.norm_2 = nn.LayerNorm.load_no_bias(
-            prefix=f"{prefix}.norm_2", weights=weights, eps=EPS
-        )
+        if config.no_bias:
+            self.norm_1 = nn.LayerNorm.load_no_bias(
+                prefix=f"{prefix}.norm_1", weights=weights, eps=EPS
+            )
+            self.norm_2 = nn.LayerNorm.load_no_bias(
+                prefix=f"{prefix}.norm_2", weights=weights, eps=EPS
+            )
+        else:
+            self.norm_1 = nn.LayerNorm.load(
+                prefix=f"{prefix}.norm_1", weights=weights, eps=EPS
+            )
+            self.norm_2 = nn.LayerNorm.load(
+                prefix=f"{prefix}.norm_2", weights=weights, eps=EPS
+            )
        self.attn = MultiheadAttention(config, prefix=f"{prefix}.attn", weights=weights)
        self.ffn = MPTMLP(config, prefix=f"{prefix}.ffn", weights=weights)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
@ -635,6 +666,9 @@ class LPLayerNorm(torch.nn.LayerNorm):
        elementwise_affine=True,
        device=None,
        dtype=None,
+        bias: Optional[bool] = True,
+        prefix=None,
+        weights=None,
    ):
        super().__init__(
            normalized_shape=normalized_shape,
@ -642,7 +676,14 @@ class LPLayerNorm(torch.nn.LayerNorm):
            elementwise_affine=elementwise_affine,
            device=device,
            dtype=dtype,
+            bias=bias,
        )
+        if weights is not None:
+            self.weight = nn.Parameter(weights.get_sharded(f"{prefix}.weight", dim=0))
+            if bias:
+                self.bias = nn.Parameter(weights.get_sharded(f"{prefix}.bias", dim=0))
+            self.normalized_shape = self.weight.shape
+

    def forward(self, x):
        module_device = x.device
@ -755,20 +796,23 @@ class MPTModel(MPTPreTrainedModel):
            )

        self.wte = TensorParallelEmbedding("transformer.wte", weights)
+
        if not self.alibi:
-            # self.wpe = torch.nn.Embedding(
-            #     config.max_seq_len, config.d_model, device=config.init_device
-            # )
-            raise RuntimeError("no alibi no supported")
+           self.wpe = TensorParallelEmbedding("transformer.wpe", weights)
        self.blocks = nn.ModuleList(
            [
                MPTBlock(config, prefix=f"transformer.blocks.{i}", weights=weights)
                for i in range(config.n_layers)
            ]
        )
-        self.norm_f = nn.LayerNorm.load_no_bias(
-            prefix="transformer.norm_f", weights=weights, eps=EPS
-        )
+        if config.no_bias:
+            self.norm_f = nn.LayerNorm.load_no_bias(
+                prefix="transformer.norm_f", weights=weights, eps=EPS
+            )
+        else:
+            self.norm_f = nn.LayerNorm.load(
+                prefix="transformer.norm_f", weights=weights, eps=EPS
+            )
        self.is_causal = not self.prefix_lm
        self._attn_bias_initialized = False
        self.attn_bias = None
@ -787,8 +831,9 @@ class MPTModel(MPTPreTrainedModel):
                    if config.verbose:
                        warnings.warn(f"Removing bias ({module.bias}) from {module}.")
                    module.register_parameter("bias", None)
-        if config.verbose and config.verbose > 2:
-            print(self)
+        if hasattr(self.config, "verbose"):
+            if config.verbose and config.verbose > 2:
+                print(self)
        if "verbose" not in self.config.init_config:
            self.config.init_config["verbose"] = self.config.verbose
        if self.config.init_config["verbose"] > 1: