Fixing mistral nemo. (#2276)

2024-07-23 11:16:03 +02:00 · 2024-07-23 11:16:03 +02:00 · abc32537ea
parent 4700465192
commit abc32537ea
3 changed files with 10 additions and 4 deletions
--- a/server/text_generation_server/models/__init__.py
+++ b/server/text_generation_server/models/__init__.py
@@ -762,8 +762,6 @@ def get_model(
                default_dtype=torch.bfloat16,
                trust_remote_code=trust_remote_code,
                lora_adapter_ids=lora_adapter_ids,
-                # hidden_size / num_attention_heads is wrong in `google/gemma-2-9b-it`
-                head_size=config_dict["head_dim"],
            )
        elif sharded:
            raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Gemma2"))
--- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py
@@ -117,6 +117,9 @@ class MistralAttention(torch.nn.Module):
        )
        self.num_heads = config.num_attention_heads
        self.hidden_size = config.hidden_size
+        if hasattr(config, "head_dim"):
+            self.head_size = config.head_dim
+        else:
            self.head_size = self.hidden_size // self.num_heads

        self.rotary_emb = PositionRotaryEmbedding.static(
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -925,6 +925,11 @@ class FlashCausalLM(Model):
        assert self.num_kv_heads > 0

        if head_size is None:
+            # Some models use GQA and different sizes for o_proj
+            # and q_proj, that allows for that.
+            if hasattr(config, "head_dim"):
+                self.head_size = config.head_dim
+            else:
                self.head_size = config.hidden_size // config.num_attention_heads
        else:
            self.head_size = head_size