From a3c45da0a44d6a727de76a72290d82a79b079f4b Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 31 Jan 2024 10:28:58 +0000
Subject: [PATCH] Improvements within mamba.

---
 .../models/test_fused_kernel_mamba.py         | 14 ++++-----
 .../models/causal_lm.py                       |  4 +--
 .../models/custom_modeling/mamba_modeling.py  | 10 ++++---
 server/text_generation_server/models/mamba.py | 30 ++++++++-----------
 4 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/integration-tests/models/test_fused_kernel_mamba.py b/integration-tests/models/test_fused_kernel_mamba.py
index 0a449332..5d6aa81f 100644
--- a/integration-tests/models/test_fused_kernel_mamba.py
+++ b/integration-tests/models/test_fused_kernel_mamba.py
@@ -51,12 +51,12 @@ async def test_fused_kernel_mamba_all_params(fused_kernel_mamba, response_snapsh
 # TODO: fix `Expected x0.dim() == 2 to be true, but got false.`
 # 94: `hidden_states, _ = self.layer_norm(hidden_states.squeeze(0))`
 # NOTE: the fast layer norm has strict requirements on the input shape
-# @pytest.mark.asyncio
-# @pytest.mark.private
-# async def test_fused_kernel_mamba_load(fused_kernel_mamba, generate_load, response_snapshot):
-#     responses = await generate_load(fused_kernel_mamba, "Test request", max_new_tokens=10, n=4)
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_fused_kernel_mamba_load(fused_kernel_mamba, generate_load, response_snapshot):
+    responses = await generate_load(fused_kernel_mamba, "Test request", max_new_tokens=10, n=4)
 
-#     assert len(responses) == 4
-#     assert all([r.generated_text == responses[0].generated_text for r in responses])
+    assert len(responses) == 4
+    assert all([r.generated_text == responses[0].generated_text for r in responses])
 
-#     assert responses == response_snapshot
+    assert responses == response_snapshot
diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py
index 7b10256c..39b3f49b 100644
--- a/server/text_generation_server/models/causal_lm.py
+++ b/server/text_generation_server/models/causal_lm.py
@@ -311,8 +311,8 @@ class CausalLMBatch(Batch):
             end_index = start_index + len(batch)
 
             # We only concatenate batches that did at least one step
-            if batch.past_key_values is None:
-                raise ValueError("only concatenate prefilled batches")
+            # if batch.past_key_values is None:
+            #     raise ValueError("only concatenate prefilled batches")
 
             # Create empty tensor
             # input_ids is always of shape [batch_size, 1]
diff --git a/server/text_generation_server/models/custom_modeling/mamba_modeling.py b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
index ca5f9765..42da669a 100644
--- a/server/text_generation_server/models/custom_modeling/mamba_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/mamba_modeling.py
@@ -91,8 +91,9 @@ class ResidualBlock(nn.Module):
         hidden_states: torch.Tensor,
     ):
         residual = hidden_states
-        hidden_states, _ = self.layer_norm(hidden_states.squeeze(0))
-        hidden_states = residual + self.mamba_block(hidden_states.unsqueeze(0))
+        shape = hidden_states.shape
+        hidden_states, _ = self.layer_norm(hidden_states.view(-1, shape[-1]))
+        hidden_states = residual + self.mamba_block(hidden_states.view(*shape))
         return hidden_states
 
 class MambaModel(nn.Module):
@@ -114,5 +115,6 @@ class MambaModel(nn.Module):
         for block in self.blocks:
             hidden_states = block(hidden_states)
 
-        final_hidden_states, _ = self.norm_f(hidden_states.squeeze(0))
-        return self.lm_head(final_hidden_states.unsqueeze(0)), input_ids
+        shape = hidden_states.shape
+        final_hidden_states, _ = self.norm_f(hidden_states.view(-1, shape[-1]))
+        return self.lm_head(final_hidden_states.view(*shape)), input_ids
diff --git a/server/text_generation_server/models/mamba.py b/server/text_generation_server/models/mamba.py
index 05a5b99e..13d77abd 100644
--- a/server/text_generation_server/models/mamba.py
+++ b/server/text_generation_server/models/mamba.py
@@ -30,23 +30,9 @@ from text_generation_server.utils.tokens import batch_top_tokens, Sampling
 
 
 class MambaCausalLMBatch(CausalLMBatch):
-    past_input_ids: Optional[torch.Tensor]
+    pass
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.past_input_ids = None
 
-    @classmethod
-    def from_pb(
-        cls,
-        pb: generate_pb2.Batch,
-        tokenizer: PreTrainedTokenizerBase,
-        dtype: torch.dtype,
-        device: torch.device,
-    ) -> "CausalLMBatch":
-        batch = super().from_pb(pb=pb, tokenizer=tokenizer, dtype=dtype, device=device)
-        batch.keys_head_dim_last = False
-        return batch
 
 
 class Mamba(Model):
@@ -119,7 +105,7 @@ class Mamba(Model):
 
     def generate_token(self, batch) -> Tuple[List[Any], Optional[Any], Tuple[int, int]]:
         start = time.time_ns()
-        input_ids = batch.past_input_ids if batch.past_input_ids is not None else batch.input_ids
+        input_ids = batch.input_ids
 
         logits, past_input_ids = self.model(input_ids)[:2]
 
@@ -151,6 +137,8 @@
         )
 
         # For each member of the batch
+        next_token_ids = []
+        kept_batch_ids = []
         for i, (
             request,
            input_length,
@@ -235,7 +223,8 @@
                 )
             else:
                 prefill_tokens = None
-            past_input_ids = torch.cat([past_input_ids, next_token_id], dim=1)
+            next_token_ids.append(next_token_id)
+            kept_batch_ids.append(i)
 
 
             if top_n_tokens > 0:
@@ -278,13 +267,18 @@
             batch.read_offsets[i] = read_offset
             batch.max_input_length = max(batch.max_input_length, new_input_length)
 
+        # Merge all new tokens
+        if next_token_ids:
+            next_token_ids = torch.cat(next_token_ids, dim=0)
+            past_input_ids = torch.cat([past_input_ids[kept_batch_ids], next_token_ids], dim=1)
+
         # We finished all generations in the batch; there is no next batch
         if stopped:
             forward_ns = start_decode - start
             decode_ns = time.time_ns() - start_decode
             return generations, None, (forward_ns, decode_ns)
 
-        batch.past_input_ids = past_input_ids
+        batch.input_ids = input_ids
 
         forward_ns = start_decode - start
         decode_ns = time.time_ns() - start_decode
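
Note (editorial commentary, not part of the diff above): the mamba_modeling.py hunks swap the squeeze(0)/unsqueeze(0) pair for an explicit flatten-and-restore around the layer norm, since the fast layer norm kernel only accepts 2D input (see the `Expected x0.dim() == 2` TODO in the test file). A minimal sketch of that reshape pattern follows, using a plain torch.nn.LayerNorm as a stand-in for the fused kernel (the real kernel additionally returns a residual, hence the `_` in the patch):

    import torch
    import torch.nn as nn

    hidden_size = 16
    norm = nn.LayerNorm(hidden_size)  # stand-in for the fast/fused layer norm

    # [batch_size, seq_len, hidden_size], as produced in MambaModel.forward
    hidden_states = torch.randn(2, 5, hidden_size)

    shape = hidden_states.shape
    # Flatten to 2D [batch_size * seq_len, hidden_size] to satisfy the kernel's shape requirement
    flat = hidden_states.view(-1, shape[-1])
    normed = norm(flat)
    # Restore the original 3D shape before handing off to the next block / lm_head
    hidden_states = normed.view(*shape)
    assert hidden_states.shape == shape

Unlike squeeze(0), which is a no-op once batch_size > 1 and would leave a 3D tensor for the kernel to reject, the view-based version handles batched decoding, which appears to be what the re-enabled test_fused_kernel_mamba_load (n=4) exercises.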