feat(server): Add Non flash MPT. (#514)

# What does this PR do? This adds a non-flash version of MPT. A flash version is harder because it requires a bias-ready CUDA kernel for flash attention. Fixes https://github.com/huggingface/text-generation-inference/issues/361 Fixes https://github.com/huggingface/text-generation-inference/issues/491 Fixes https://github.com/huggingface/text-generation-inference/issues/290
2023-07-03 13:01:46 +02:00 · 2023-07-03 13:01:46 +02:00 · 1da07e85aa
parent e28a809004
commit 1da07e85aa
10 changed files with 2011 additions and 1 deletions
--- a/integration-tests/models/snapshots/test_mpt/test_mpt.json
+++ b/integration-tests/models/snapshots/test_mpt/test_mpt.json
@ -0,0 +1,140 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 17,
    "prefill": [
      {
        "id": 1276,
        "logprob": null,
        "text": "What"
      },
      {
        "id": 310,
        "logprob": -1.5117188,
        "text": " is"
      },
      {
        "id": 18147,
        "logprob": -8.96875,
        "text": " Deep"
      },
      {
        "id": 20727,
        "logprob": -1.953125,
        "text": " Learning"
      },
      {
        "id": 32,
        "logprob": -0.94189453,
        "text": "?"
      }
    ],
    "seed": null,
    "tokens": [
      {
        "id": 428,
        "logprob": -1.5830078,
        "special": false,
        "text": " -"
      },
      {
        "id": 18147,
        "logprob": -3.3105469,
        "special": false,
        "text": " Deep"
      },
      {
        "id": 20727,
        "logprob": -0.3215332,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 187,
        "logprob": -2.5566406,
        "special": false,
        "text": "\n"
      },
      {
        "id": 30763,
        "logprob": -1.6074219,
        "special": false,
        "text": "Deep"
      },
      {
        "id": 20727,
        "logprob": -0.69628906,
        "special": false,
        "text": " Learning"
      },
      {
        "id": 310,
        "logprob": -0.6923828,
        "special": false,
        "text": " is"
      },
      {
        "id": 247,
        "logprob": -0.5263672,
        "special": false,
        "text": " a"
      },
      {
        "id": 749,
        "logprob": -1.8544922,
        "special": false,
        "text": " sub"
      },
      {
        "id": 3423,
        "logprob": -0.6118164,
        "special": false,
        "text": "field"
      },
      {
        "id": 273,
        "logprob": -0.055877686,
        "special": false,
        "text": " of"
      },
      {
        "id": 5145,
        "logprob": -1.0537109,
        "special": false,
        "text": " machine"
      },
      {
        "id": 4715,
        "logprob": -0.0115737915,
        "special": false,
        "text": " learning"
      },
      {
        "id": 326,
        "logprob": -0.9111328,
        "special": false,
        "text": " that"
      },
      {
        "id": 4648,
        "logprob": -1.4589844,
        "special": false,
        "text": " uses"
      },
      {
        "id": 13345,
        "logprob": -1.4853516,
        "special": false,
        "text": " artificial"
      },
      {
        "id": 11454,
        "logprob": -0.021636963,
        "special": false,
        "text": " neural"
      }
    ]
  },
  "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
 }
--- a/integration-tests/models/snapshots/test_mpt/test_mpt_load.json
+++ b/integration-tests/models/snapshots/test_mpt/test_mpt_load.json
@ -0,0 +1,562 @@
 [
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 17,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -1.5117188,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -8.96875,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -1.953125,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -0.94189453,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 428,
          "logprob": -1.5830078,
          "special": false,
          "text": " -"
        },
        {
          "id": 18147,
          "logprob": -3.3183594,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -0.32617188,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 187,
          "logprob": -2.5742188,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.6015625,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 20727,
          "logprob": -0.69628906,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 310,
          "logprob": -0.67822266,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.5395508,
          "special": false,
          "text": " a"
        },
        {
          "id": 749,
          "logprob": -1.8623047,
          "special": false,
          "text": " sub"
        },
        {
          "id": 3423,
          "logprob": -0.6020508,
          "special": false,
          "text": "field"
        },
        {
          "id": 273,
          "logprob": -0.0552063,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.0742188,
          "special": false,
          "text": " machine"
        },
        {
          "id": 4715,
          "logprob": -0.011405945,
          "special": false,
          "text": " learning"
        },
        {
          "id": 326,
          "logprob": -0.9165039,
          "special": false,
          "text": " that"
        },
        {
          "id": 4648,
          "logprob": -1.4501953,
          "special": false,
          "text": " uses"
        },
        {
          "id": 13345,
          "logprob": -1.4960938,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11454,
          "logprob": -0.02116394,
          "special": false,
          "text": " neural"
        }
      ]
    },
    "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 17,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -1.5,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -8.984375,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -1.96875,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -0.93359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 428,
          "logprob": -1.5800781,
          "special": false,
          "text": " -"
        },
        {
          "id": 18147,
          "logprob": -3.3242188,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -0.31835938,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 187,
          "logprob": -2.5644531,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.5957031,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 20727,
          "logprob": -0.69628906,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 310,
          "logprob": -0.68603516,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.5258789,
          "special": false,
          "text": " a"
        },
        {
          "id": 749,
          "logprob": -1.859375,
          "special": false,
          "text": " sub"
        },
        {
          "id": 3423,
          "logprob": -0.6166992,
          "special": false,
          "text": "field"
        },
        {
          "id": 273,
          "logprob": -0.056762695,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.0703125,
          "special": false,
          "text": " machine"
        },
        {
          "id": 4715,
          "logprob": -0.011428833,
          "special": false,
          "text": " learning"
        },
        {
          "id": 326,
          "logprob": -0.9213867,
          "special": false,
          "text": " that"
        },
        {
          "id": 4648,
          "logprob": -1.4726562,
          "special": false,
          "text": " uses"
        },
        {
          "id": 13345,
          "logprob": -1.5039062,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11454,
          "logprob": -0.021652222,
          "special": false,
          "text": " neural"
        }
      ]
    },
    "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 17,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -1.5,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -8.984375,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -1.96875,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -0.93359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 428,
          "logprob": -1.5800781,
          "special": false,
          "text": " -"
        },
        {
          "id": 18147,
          "logprob": -3.3242188,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -0.31835938,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 187,
          "logprob": -2.5644531,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.5957031,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 20727,
          "logprob": -0.69628906,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 310,
          "logprob": -0.68603516,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.5258789,
          "special": false,
          "text": " a"
        },
        {
          "id": 749,
          "logprob": -1.859375,
          "special": false,
          "text": " sub"
        },
        {
          "id": 3423,
          "logprob": -0.6166992,
          "special": false,
          "text": "field"
        },
        {
          "id": 273,
          "logprob": -0.056762695,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.0703125,
          "special": false,
          "text": " machine"
        },
        {
          "id": 4715,
          "logprob": -0.011428833,
          "special": false,
          "text": " learning"
        },
        {
          "id": 326,
          "logprob": -0.9213867,
          "special": false,
          "text": " that"
        },
        {
          "id": 4648,
          "logprob": -1.4726562,
          "special": false,
          "text": " uses"
        },
        {
          "id": 13345,
          "logprob": -1.5039062,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11454,
          "logprob": -0.021652222,
          "special": false,
          "text": " neural"
        }
      ]
    },
    "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
  },
  {
    "details": {
      "best_of_sequences": null,
      "finish_reason": "length",
      "generated_tokens": 17,
      "prefill": [
        {
          "id": 1276,
          "logprob": null,
          "text": "What"
        },
        {
          "id": 310,
          "logprob": -1.5,
          "text": " is"
        },
        {
          "id": 18147,
          "logprob": -8.984375,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -1.96875,
          "text": " Learning"
        },
        {
          "id": 32,
          "logprob": -0.93359375,
          "text": "?"
        }
      ],
      "seed": null,
      "tokens": [
        {
          "id": 428,
          "logprob": -1.5800781,
          "special": false,
          "text": " -"
        },
        {
          "id": 18147,
          "logprob": -3.3242188,
          "special": false,
          "text": " Deep"
        },
        {
          "id": 20727,
          "logprob": -0.31835938,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 187,
          "logprob": -2.5644531,
          "special": false,
          "text": "\n"
        },
        {
          "id": 30763,
          "logprob": -1.5957031,
          "special": false,
          "text": "Deep"
        },
        {
          "id": 20727,
          "logprob": -0.69628906,
          "special": false,
          "text": " Learning"
        },
        {
          "id": 310,
          "logprob": -0.68603516,
          "special": false,
          "text": " is"
        },
        {
          "id": 247,
          "logprob": -0.5258789,
          "special": false,
          "text": " a"
        },
        {
          "id": 749,
          "logprob": -1.859375,
          "special": false,
          "text": " sub"
        },
        {
          "id": 3423,
          "logprob": -0.6166992,
          "special": false,
          "text": "field"
        },
        {
          "id": 273,
          "logprob": -0.056762695,
          "special": false,
          "text": " of"
        },
        {
          "id": 5145,
          "logprob": -1.0703125,
          "special": false,
          "text": " machine"
        },
        {
          "id": 4715,
          "logprob": -0.011428833,
          "special": false,
          "text": " learning"
        },
        {
          "id": 326,
          "logprob": -0.9213867,
          "special": false,
          "text": " that"
        },
        {
          "id": 4648,
          "logprob": -1.4726562,
          "special": false,
          "text": " uses"
        },
        {
          "id": 13345,
          "logprob": -1.5039062,
          "special": false,
          "text": " artificial"
        },
        {
          "id": 11454,
          "logprob": -0.021652222,
          "special": false,
          "text": " neural"
        }
      ]
    },
    "generated_text": " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"
  }
 ]
--- a/integration-tests/models/test_mpt.py
+++ b/integration-tests/models/test_mpt.py
@ -0,0 +1,48 @@
 import pytest
@pytest.fixture(scope="module")
def mpt_sharded_handle(launcher):
    """Yield a launcher handle for ``mosaicml/mpt-7b`` started with two shards.

    The handle is produced inside the launcher's context manager, so the
    launcher's exit logic runs once the module-scoped fixture is torn down.
    """
    with launcher("mosaicml/mpt-7b", num_shard=2) as server_handle:
        yield server_handle
@pytest.fixture(scope="module")
async def mpt_sharded(mpt_sharded_handle):
    """Return the client of the sharded MPT handle after its health check.

    NOTE(review): ``300`` is presumably a timeout in seconds -- confirm against
    the handle's ``health`` implementation.
    """
    handle = mpt_sharded_handle
    await handle.health(300)
    return handle.client
@pytest.mark.asyncio
async def test_mpt(mpt_sharded, response_snapshot):
    """Generate 17 tokens for a fixed prompt and compare with the snapshot."""
    expected_text = " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"

    response = await mpt_sharded.generate(
        "What is Deep Learning?",
        max_new_tokens=17,
        decoder_input_details=True,
    )

    # Token count first, then the exact text, then the full recorded response.
    assert response.details.generated_tokens == 17
    assert response.generated_text == expected_text
    assert response == response_snapshot
@pytest.mark.asyncio
async def test_mpt_load(mpt_sharded, generate_load, response_snapshot):
    """Send the same prompt four times and check all answers agree.

    Every response must carry the same generated text, that text must equal
    the expected completion, and the full list must match the snapshot.
    """
    expected_text = " - Deep Learning\nDeep Learning is a subfield of machine learning that uses artificial neural"

    responses = await generate_load(
        mpt_sharded,
        "What is Deep Learning?",
        max_new_tokens=17,
        n=4,
    )

    assert len(responses) == 4
    first_text = responses[0].generated_text
    assert all(r.generated_text == first_text for r in responses)
    assert first_text == expected_text

    assert responses == response_snapshot
--- a/server/poetry.lock
+++ b/server/poetry.lock
@ -187,6 +187,17 @@ wrapt = ">=1.10,<2"
 [package.extras]
 dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]
 [[package]]
 name = "einops"
 version = "0.6.1"
 description = "A new flavour of deep learning operations"
 optional = false
 python-versions = ">=3.7"
 files = [
    {file = "einops-0.6.1-py3-none-any.whl", hash = "sha256:99149e46cc808956b174932fe563d920db4d6e5dadb8c6ecdaa7483b7ef7cfc3"},
    {file = "einops-0.6.1.tar.gz", hash = "sha256:f95f8d00f4ded90dbc4b19b6f98b177332614b0357dde66997f3ae5d474dc8c8"},
 ]
 [[package]]
 name = "exceptiongroup"
 version = "1.1.1"
@ -1586,4 +1597,4 @@ bnb = ["bitsandbytes"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "54ecacb32d699cb1298c237c4661c1b707f119cf2c27bd54bad7a1ea2ffb8b10"
+content-hash = "3174a211d30bed5990ed5f8418416c951bb6c585153fb51b62809baa89ef07d0"
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@ -27,6 +27,7 @@ sentencepiece = "^0.1.97"
 tokenizers = "0.13.3"
 huggingface-hub = "^0.14.1"
 transformers = "^4.29.2"
 einops = "^0.6.1"
 [tool.poetry.extras]
 accelerate = ["accelerate"]
--- a/server/requirements.txt
+++ b/server/requirements.txt
@ -4,6 +4,7 @@ charset-normalizer==3.1.0 ; python_version >= "3.9" and python_version < "4.0"
 click==8.1.3 ; python_version >= "3.9" and python_version < "4.0"
 colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0" and (sys_platform == "win32" or platform_system == "Windows")
 deprecated==1.2.14 ; python_version >= "3.9" and python_version < "4.0"
 einops==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
 filelock==3.12.2 ; python_version >= "3.9" and python_version < "4.0"
 fsspec==2023.6.0 ; python_version >= "3.9" and python_version < "4.0"
 googleapis-common-protos==1.59.1 ; python_version >= "3.9" and python_version < "4.0"
--- a/server/text_generation_server/models/init.py
+++ b/server/text_generation_server/models/init.py
@ -10,6 +10,7 @@ from text_generation_server.models.model import Model
 from text_generation_server.models.causal_lm import CausalLM
 from text_generation_server.models.flash_causal_lm import FlashCausalLM
 from text_generation_server.models.bloom import BLOOMSharded
 from text_generation_server.models.mpt import MPTSharded
 from text_generation_server.models.seq2seq_lm import Seq2SeqLM
 from text_generation_server.models.rw import RW
 from text_generation_server.models.opt import OPTSharded
@ -178,6 +179,10 @@ def get_model(
            dtype=dtype,
            trust_remote_code=trust_remote_code,
        )
    elif model_type == "mpt":
        return MPTSharded(
            model_id, revision, quantize=quantize, trust_remote_code=trust_remote_code
        )
    elif model_type == "gpt_neox":
        if FLASH_ATTENTION:
--- a/server/text_generation_server/models/custom_modeling/mpt_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mpt_modeling.py
--- a/server/text_generation_server/models/mpt.py
+++ b/server/text_generation_server/models/mpt.py
@ -0,0 +1,90 @@
 import torch
 import torch.distributed
 from typing import Optional, Type
 from opentelemetry import trace
 from transformers import AutoTokenizer, PretrainedConfig, PreTrainedTokenizerBase
 from huggingface_hub import hf_hub_download
 import json
 from text_generation_server.models import CausalLM
 from text_generation_server.models.causal_lm import CausalLMBatch
 from text_generation_server.pb import generate_pb2
 from text_generation_server.models.custom_modeling.mpt_modeling import (
    MPTForCausalLM,
 )
 from text_generation_server.utils import (
    initialize_torch_distributed,
    weight_files,
    Weights,
 )
 # Module-level OpenTelemetry tracer, named after this module.
 tracer = trace.get_tracer(__name__)
 class MPTCausalLMBatch(CausalLMBatch):
    """Batch type used by the MPT model.

    Behaves like ``CausalLMBatch`` except that batches built through
    ``from_pb`` have ``keys_head_dim_last`` set to ``False``.
    """

    @classmethod
    def from_pb(
        cls,
        pb: generate_pb2.Batch,
        tokenizer: PreTrainedTokenizerBase,
        dtype: torch.dtype,
        device: torch.device,
    ) -> "CausalLMBatch":
        """Build a batch with the parent implementation and flag the key layout.

        All arguments are forwarded unchanged, by keyword, to
        ``CausalLMBatch.from_pb``.
        """
        mpt_batch = super().from_pb(
            pb=pb, tokenizer=tokenizer, dtype=dtype, device=device
        )
        # NOTE(review): presumably tells the batch code that cached keys do not
        # keep head_dim as their last dimension -- confirm in CausalLMBatch.
        mpt_batch.keys_head_dim_last = False
        return mpt_batch
 class MPTSharded(CausalLM):
    """Non-flash MPT causal language model loaded through a process group.

    Only CUDA devices are supported; weights are loaded as ``torch.float16``.
    """

    def __init__(
        self,
        model_id: str,
        revision: Optional[str] = None,
        quantize: Optional[str] = None,
        trust_remote_code: bool = False,
    ):
        """Load the tokenizer, config and safetensors weights, then build the model.

        Args:
            model_id: Model identifier passed to the tokenizer, config and
                weight-file loaders.
            revision: Optional revision passed to the same loaders.
            quantize: Quantization setting stored on the config as
                ``config.quantize``.
            trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained``.

        Raises:
            NotImplementedError: If CUDA is not available.
        """
        self.process_group, rank, world_size = initialize_torch_distributed()
        if not torch.cuda.is_available():
            raise NotImplementedError("MPTSharded is only available on GPU")
        device = torch.device(f"cuda:{rank}")
        dtype = torch.float16

        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            revision=revision,
            padding_side="left",
            truncation_side="left",
            trust_remote_code=trust_remote_code,
        )
        # Reuse the EOS token as the padding token.
        tokenizer.pad_token = tokenizer.eos_token

        # The raw config.json is wrapped in a plain PretrainedConfig.
        filename = hf_hub_download(model_id, revision=revision, filename="config.json")
        # Decode explicitly as UTF-8 rather than relying on the locale default.
        with open(filename, "r", encoding="utf-8") as f:
            config_dict = json.load(f)
        config = PretrainedConfig(**config_dict)
        config.quantize = quantize

        torch.distributed.barrier(group=self.process_group)

        filenames = weight_files(model_id, revision=revision, extension=".safetensors")
        weights = Weights(filenames, device, dtype, process_group=self.process_group)
        model = MPTForCausalLM(config, weights)

        torch.distributed.barrier(group=self.process_group)

        # super(CausalLM, self) skips CausalLM.__init__ and calls the next
        # initializer in the MRO with the already-built model and tokenizer.
        super(CausalLM, self).__init__(
            model=model,
            tokenizer=tokenizer,
            requires_padding=False,
            dtype=dtype,
            device=device,
            rank=rank,
            world_size=world_size,
        )

    @property
    def batch_type(self) -> Type[CausalLMBatch]:
        """Batch class used with this model (``MPTCausalLMBatch``)."""
        return MPTCausalLMBatch
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@ -31,7 +31,19 @@ def load_layer_norm(cls, prefix, weights, eps):
    return ln
@classmethod
def load_layer_norm_no_bias(cls, prefix, weights, eps):
    """Build a layer norm from a stored weight tensor, with no bias.

    Reads ``{prefix}.weight`` from ``weights``, instantiates ``cls`` with the
    tensor's shape and ``eps`` under ``init_empty_weights()``, installs the
    tensor as the ``weight`` parameter and sets ``bias`` to ``None``.
    """
    scale = weights.get_tensor(f"{prefix}.weight")
    with init_empty_weights():
        layer = cls(scale.shape, eps=eps)

    layer.weight = nn.Parameter(scale)
    layer.bias = None
    return layer
 # Attach the loader helpers to torch.nn.LayerNorm as the attributes
 # ``load`` and ``load_no_bias``.
 torch.nn.LayerNorm.load = load_layer_norm
 torch.nn.LayerNorm.load_no_bias = load_layer_norm_no_bias
 class FastLinear(nn.Module):