From 5ad39dd3c35f751c8b553ebb802aa2d996d5c9c5 Mon Sep 17 00:00:00 2001
From: "Wang, Yi"
Date: Wed, 24 Jul 2024 16:39:08 +0800
Subject: [PATCH] fix crash in multi-modal (#2245)

* fix crash in multi-modal

Signed-off-by: Wang, Yi A

* update according to review comment

Signed-off-by: Wang, Yi A

* fix llava_next regression in latest main

Signed-off-by: Wang, Yi A

---------

Signed-off-by: Wang, Yi A
---
 .../models/custom_modeling/flash_llama_modeling.py              | 2 +-
 .../text_generation_server/models/custom_modeling/idefics2.py   | 1 +
 .../text_generation_server/models/custom_modeling/llava_next.py | 1 +
 server/text_generation_server/models/vlm_causal_lm.py           | 2 ++
 4 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index f7980d2d..3e8e67ab 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -424,7 +424,7 @@ class FlashLlamaModel(torch.nn.Module):
                 FlashLlamaLayer(
                     index=0,
                     prefix=(
-                        "model.layers.0" if not prefix else "{prefix}.model.layers.0"
+                        "model.layers.0" if not prefix else f"{prefix}.model.layers.0"
                     ),
                     config=config,
                     weights=weights,
diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py
index 735c3899..f540b7af 100644
--- a/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/server/text_generation_server/models/custom_modeling/idefics2.py
@@ -832,6 +832,7 @@ class Idefics2ForConditionalGeneration(nn.Module):
             max_s=max_s,
             true_max_s=max_s,
             prefill_cache_indices=None,
+            adapter_data=adapter_data,
         )
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py
index 567131ef..e154d805 100644
--- a/server/text_generation_server/models/custom_modeling/llava_next.py
+++ b/server/text_generation_server/models/custom_modeling/llava_next.py
@@ -280,6 +280,7 @@ class LlavaNextForConditionalGeneration(nn.Module):
             max_s=max_s,
             true_max_s=max_s,
             prefill_cache_indices=None,
+            adapter_data=adapter_data,
         )
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index 308d5a3d..7de54aa4 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -14,6 +14,7 @@ from text_generation_server.models.flash_causal_lm import (
 )
 from text_generation_server.utils.log import log_master
 from transformers import AutoProcessor
+from text_generation_server.layers.attention import Seqlen

 tracer = trace.get_tracer(__name__)

@@ -348,6 +349,7 @@ class VlmCausalLM(FlashCausalLM):
         else:
             cuda_graph = None
         if cu_seqlen_prefill is not None or cuda_graph is None:
+            input_lengths = Seqlen(input_lengths=input_lengths)
             logits, speculative_logits = self.model.forward(
                 input_ids=input_ids,
                 position_ids=position_ids,
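
For context, a minimal sketch of the missing-f-string bug fixed in flash_llama_modeling.py above; the prefix value "language_model" is only an illustrative assumption for what a multi-modal wrapper might pass in, not taken from the patch:

    # Minimal sketch, assuming a non-empty prefix such as "language_model".
    prefix = "language_model"
    # Without the f prefix, the braces stay literal, so the resulting weight
    # name "{prefix}.model.layers.0..." would not match any stored weight.
    broken = "model.layers.0" if not prefix else "{prefix}.model.layers.0"
    # With the f-string, the prefix is interpolated as intended.
    fixed = "model.layers.0" if not prefix else f"{prefix}.model.layers.0"
    print(broken)  # {prefix}.model.layers.0
    print(fixed)   # language_model.model.layers.0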