From b4e096c08005006d7bfac7b45ec7dd6aa7f162e5 Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Thu, 14 Nov 2024 10:48:18 +0000 Subject: [PATCH] connecting video to qwen2 --- server/requirements_cuda.txt | 1 + server/text_generation_server/models/vlm_causal_lm.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index 13863650..341c291c 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -45,6 +45,7 @@ scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" +torchvision==0.19.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index aa0fe107..64e10a3a 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -211,6 +211,9 @@ class VlmCausalLMBatch(FlashCausalLMBatch): processor, image_inputs, config, image_id ) image_id += 1 + elif chunk_type == "video" and config.model_type == "qwen2_vl": + # Based on Qwen2VL's video token format + full_text += f"<|vision_start|><|video_pad|><|vision_end|>" full_text = image_text_replacement_fixup(config, full_text)