From b4e096c08005006d7bfac7b45ec7dd6aa7f162e5 Mon Sep 17 00:00:00 2001 From: Miquel Farre Date: Thu, 14 Nov 2024 10:48:18 +0000 Subject: [PATCH] connecting video to qwen2 --- server/requirements_cuda.txt | 1 + server/text_generation_server/models/vlm_causal_lm.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index 13863650..341c291c 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -45,6 +45,7 @@ scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" +torchvision==0.19.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" transformers==4.46.0 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index aa0fe107..64e10a3a 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -211,6 +211,9 @@ class VlmCausalLMBatch(FlashCausalLMBatch): processor, image_inputs, config, image_id ) image_id += 1 + elif chunk_type == "video" and config.model_type == "qwen2_vl": + # Based on Qwen2VL's video token format + full_text += f"<|vision_start|><|video_pad|><|vision_end|>" full_text = image_text_replacement_fixup(config, full_text)