Support tied embeddings in 0.5B and 1.5B Qwen2 models (#2313)
This commit is contained in:
parent 3905f854ed
commit 4b49c50f4c
@@ -262,6 +262,9 @@ class Qwen2Layer(nn.Module):
 class Qwen2Model(torch.nn.Module):
     def __init__(self, prefix: str, config, weights):
         super().__init__()
+
+        prefix = f"{prefix}.model" if prefix else "model"
+
         process_group = weights.process_group
         self.tp_rank = process_group.rank()
         self.tp_world_size = process_group.size()
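The added line moves prefix handling into Qwen2Model itself: with no outer prefix the weights resolve to "model.*", otherwise they nest under the caller's prefix. A minimal sketch of that resolution rule, assuming a hypothetical wrapper prefix "language_model" purely for illustration:

def resolve_model_prefix(prefix: str) -> str:
    # Same expression as the added line: fall back to "model" when no
    # outer prefix is given, otherwise nest under it.
    return f"{prefix}.model" if prefix else "model"

# Standalone Qwen2 checkpoint: weights live under "model.*".
assert resolve_model_prefix("") == "model"
# Hypothetical outer wrapper embedding the text model under "language_model":
# weights would then live under "language_model.model.*".
assert resolve_model_prefix("language_model") == "language_model.model"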
@@ -335,15 +338,16 @@ class Qwen2ForCausalLM(torch.nn.Module):
     def __init__(self, prefix: str, config, weights):
         super().__init__()
 
-        if not prefix:
-            prefix = "model"
-        else:
-            prefix = f"{prefix}.model"
-
         self.model = Qwen2Model(prefix, config, weights)
+
+        if config.tie_word_embeddings:
+            suffix = "model.embed_tokens"
+        else:
+            suffix = "lm_head"
+
         self.lm_head = SpeculativeHead.load(
             config,
-            prefix="lm_head",
+            prefix=f"{prefix}.{suffix}" if prefix else suffix,
             weights=weights,
         )
         self.max_past = config.sliding_window
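The new branch is what enables tied embeddings: for checkpoints with tie_word_embeddings set (as in the 0.5B and 1.5B Qwen2 models), the output head is loaded from the input embedding weights instead of a separate lm_head tensor. A small sketch of the prefix selection under that assumption; the helper name and example arguments are illustrative, not part of the diff:

def resolve_lm_head_prefix(prefix: str, tie_word_embeddings: bool) -> str:
    # Tied checkpoints reuse the input embedding matrix as the output
    # projection, so point the head loader at "model.embed_tokens";
    # untied checkpoints keep a dedicated "lm_head" tensor.
    suffix = "model.embed_tokens" if tie_word_embeddings else "lm_head"
    return f"{prefix}.{suffix}" if prefix else suffix

# Tied case (e.g. Qwen2 0.5B / 1.5B), loaded without an outer prefix:
assert resolve_lm_head_prefix("", True) == "model.embed_tokens"
# Untied case with a dedicated output head:
assert resolve_lm_head_prefix("", False) == "lm_head"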