Support tied embeddings in 0.5B and 1.5B Qwen2 models (#2313)

2024-07-26 14:57:24 +02:00 · 2024-07-26 14:57:24 +02:00 · 4b49c50f4c
parent 3905f854ed
commit 4b49c50f4c
1 changed file with 10 additions and 6 deletions
--- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py
@@ -262,6 +262,9 @@ class Qwen2Layer(nn.Module):
 class Qwen2Model(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()
+
+        prefix = f"{prefix}.model" if prefix else "model"
+
        process_group = weights.process_group
        self.tp_rank = process_group.rank()
        self.tp_world_size = process_group.size()
@@ -335,15 +338,16 @@ class Qwen2ForCausalLM(torch.nn.Module):
    def __init__(self, prefix: str, config, weights):
        super().__init__()

-        if not prefix:
-            prefix = "model"
-        else:
-            prefix = f"{prefix}.model"
-
        self.model = Qwen2Model(prefix, config, weights)
+
+        if config.tie_word_embeddings:
+            suffix = "model.embed_tokens"
+        else:
+            suffix = "lm_head"
+
        self.lm_head = SpeculativeHead.load(
            config,
-            prefix="lm_head",
+            prefix=f"{prefix}.{suffix}" if prefix else suffix,
            weights=weights,
        )
        self.max_past = config.sliding_window