fix: LlamaTokenizerFast to AutoTokenizer at flash_mistral.py (#1637)

# What does this PR do? A few cases where you're using a mistral structure or mixtral structure but not a llama tokenizer, why not make it to call the AutoTokenizer in exception handling. Similar PR #619 @Narsil
2024-03-23 01:13:13 +09:00 · 2024-03-23 01:13:13 +09:00 · 66914f7b19
parent 08e9181418
commit 66914f7b19
1 changed files with 17 additions and 8 deletions
--- a/server/text_generation_server/models/flash_mistral.py
+++ b/server/text_generation_server/models/flash_mistral.py
@ -6,7 +6,7 @@ import numpy as np
 from dataclasses import dataclass
 from opentelemetry import trace
-from transformers import PreTrainedTokenizerBase
+from transformers import PreTrainedTokenizerBase, AutoTokenizer
 from transformers.models.llama import LlamaTokenizerFast
 from typing import Optional, Tuple, Type
@ -317,6 +317,7 @@ class BaseFlashMistral(FlashCausalLM):
        else:
            raise NotImplementedError("FlashMistral is only available on GPU")
        try:
            tokenizer = LlamaTokenizerFast.from_pretrained(
                model_id,
                revision=revision,
@ -324,6 +325,14 @@ class BaseFlashMistral(FlashCausalLM):
                truncation_side="left",
                trust_remote_code=trust_remote_code,
            )
        except Exception:
            tokenizer = AutoTokenizer.from_pretrained(
                model_id,
                revision=revision,
                padding_side="left",
                truncation_side="left",
                trust_remote_code=trust_remote_code,
            )
        config = config_cls.from_pretrained(
            model_id, revision=revision, trust_remote_code=trust_remote_code