From 5ad39dd3c35f751c8b553ebb802aa2d996d5c9c5 Mon Sep 17 00:00:00 2001
From: "Wang, Yi"
Date: Wed, 24 Jul 2024 16:39:08 +0800
Subject: [PATCH] fix crash in multi-modal (#2245)

* fix crash in multi-modal

Signed-off-by: Wang, Yi A

* update according to review comment

Signed-off-by: Wang, Yi A

* fix llava_next regression in latest main

Signed-off-by: Wang, Yi A

---------

Signed-off-by: Wang, Yi A
---
 .../models/custom_modeling/flash_llama_modeling.py              | 2 +-
 .../text_generation_server/models/custom_modeling/idefics2.py   | 1 +
 .../text_generation_server/models/custom_modeling/llava_next.py | 1 +
 server/text_generation_server/models/vlm_causal_lm.py           | 2 ++
 4 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
index f7980d2d..3e8e67ab 100644
--- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
+++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py
@@ -424,7 +424,7 @@ class FlashLlamaModel(torch.nn.Module):
                 FlashLlamaLayer(
                     index=0,
                     prefix=(
-                        "model.layers.0" if not prefix else "{prefix}.model.layers.0"
+                        "model.layers.0" if not prefix else f"{prefix}.model.layers.0"
                     ),
                     config=config,
                     weights=weights,
diff --git a/server/text_generation_server/models/custom_modeling/idefics2.py b/server/text_generation_server/models/custom_modeling/idefics2.py
index 735c3899..f540b7af 100644
--- a/server/text_generation_server/models/custom_modeling/idefics2.py
+++ b/server/text_generation_server/models/custom_modeling/idefics2.py
@@ -832,6 +832,7 @@ class Idefics2ForConditionalGeneration(nn.Module):
             max_s=max_s,
             true_max_s=max_s,
             prefill_cache_indices=None,
+            adapter_data=adapter_data,
         )
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
diff --git a/server/text_generation_server/models/custom_modeling/llava_next.py b/server/text_generation_server/models/custom_modeling/llava_next.py
index 567131ef..e154d805 100644
--- a/server/text_generation_server/models/custom_modeling/llava_next.py
+++ b/server/text_generation_server/models/custom_modeling/llava_next.py
@@ -280,6 +280,7 @@ class LlavaNextForConditionalGeneration(nn.Module):
             max_s=max_s,
             true_max_s=max_s,
             prefill_cache_indices=None,
+            adapter_data=adapter_data,
         )
         if lm_head_indices is not None:
             hidden_states = hidden_states[lm_head_indices]
diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py
index 308d5a3d..7de54aa4 100644
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@@ -14,6 +14,7 @@ from text_generation_server.models.flash_causal_lm import (
 )
 from text_generation_server.utils.log import log_master
 from transformers import AutoProcessor
+from text_generation_server.layers.attention import Seqlen

 tracer = trace.get_tracer(__name__)

@@ -348,6 +349,7 @@ class VlmCausalLM(FlashCausalLM):
         else:
             cuda_graph = None
         if cu_seqlen_prefill is not None or cuda_graph is None:
+            input_lengths = Seqlen(input_lengths=input_lengths)
             logits, speculative_logits = self.model.forward(
                 input_ids=input_ids,
                 position_ids=position_ids,
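
For context, a minimal sketch of the missing-f-string bug fixed in flash_llama_modeling.py above; the prefix value "language_model" is only an illustrative assumption for what a multi-modal wrapper might pass in, not taken from the patch:

    # Minimal sketch, assuming a non-empty prefix such as "language_model".
    prefix = "language_model"
    # Without the f prefix, the braces stay literal, so the resulting weight
    # name "{prefix}.model.layers.0..." would not match any stored weight.
    broken = "model.layers.0" if not prefix else "{prefix}.model.layers.0"
    # With the f-string, the prefix is interpolated as intended.
    fixed = "model.layers.0" if not prefix else f"{prefix}.model.layers.0"
    print(broken)  # {prefix}.model.layers.0
    print(fixed)   # language_model.model.layers.0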