diff --git a/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json
new file mode 100644
index 00000000..ab4f3015
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json
@@ -0,0 +1,61 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "eos_token",
+    "generated_tokens": 8,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 2502,
+        "logprob": -1.734375,
+        "special": false,
+        "text": "image"
+      },
+      {
+        "id": 2196,
+        "logprob": -0.5756836,
+        "special": false,
+        "text": " result"
+      },
+      {
+        "id": 604,
+        "logprob": -0.007843018,
+        "special": false,
+        "text": " for"
+      },
+      {
+        "id": 12254,
+        "logprob": -1.7167969,
+        "special": false,
+        "text": " chicken"
+      },
+      {
+        "id": 611,
+        "logprob": -0.17053223,
+        "special": false,
+        "text": " on"
+      },
+      {
+        "id": 573,
+        "logprob": -0.7626953,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 8318,
+        "logprob": -0.02709961,
+        "special": false,
+        "text": " beach"
+      },
+      {
+        "id": 1,
+        "logprob": -0.20739746,
+        "special": true,
+        "text": "<eos>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "image result for chicken on the beach"
+}
diff --git a/integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json b/integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json
new file mode 100644
index 00000000..a4727707
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_idefics/test_idefics_two_images.json
@@ -0,0 +1,85 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "eos_token",
+    "generated_tokens": 12,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 450,
+        "logprob": -0.26342773,
+        "special": false,
+        "text": " The"
+      },
+      {
+        "id": 21282,
+        "logprob": -0.01838684,
+        "special": false,
+        "text": " cow"
+      },
+      {
+        "id": 322,
+        "logprob": -0.18041992,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 521,
+        "logprob": -0.62841797,
+        "special": false,
+        "text": " ch"
+      },
+      {
+        "id": 21475,
+        "logprob": -0.0037956238,
+        "special": false,
+        "text": "icken"
+      },
+      {
+        "id": 526,
+        "logprob": -0.018737793,
+        "special": false,
+        "text": " are"
+      },
+      {
+        "id": 373,
+        "logprob": -1.0820312,
+        "special": false,
+        "text": " on"
+      },
+      {
+        "id": 263,
+        "logprob": -0.5083008,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 25695,
+        "logprob": -0.07128906,
+        "special": false,
+        "text": " beach"
+      },
+      {
+        "id": 29889,
+        "logprob": -0.12573242,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 32002,
+        "logprob": -0.0029792786,
+        "special": true,
+        "text": "<end_of_utterance>"
+      },
+      {
+        "id": 2,
+        "logprob": -0.00024962425,
+        "special": true,
+        "text": "</s>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " The cow and chicken are on a beach."
+}
diff --git a/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json
new file mode 100644
index 00000000..86c95b29
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_idefics2/test_flash_idefics2_two_images.json
@@ -0,0 +1,133 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 20,
+    "prefill": [],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 415,
+        "logprob": -0.04421997,
+        "special": false,
+        "text": " The"
+      },
+      {
+        "id": 12072,
+        "logprob": -0.13500977,
+        "special": false,
+        "text": " cow"
+      },
+      {
+        "id": 349,
+        "logprob": -0.06750488,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 6328,
+        "logprob": -0.6352539,
+        "special": false,
+        "text": " standing"
+      },
+      {
+        "id": 356,
+        "logprob": -0.16186523,
+        "special": false,
+        "text": " on"
+      },
+      {
+        "id": 272,
+        "logprob": -0.5078125,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 10305,
+        "logprob": -0.017913818,
+        "special": false,
+        "text": " beach"
+      },
+      {
+        "id": 304,
+        "logprob": -1.5205078,
+        "special": false,
+        "text": " and"
+      },
+      {
+        "id": 272,
+        "logprob": -0.029174805,
+        "special": false,
+        "text": " the"
+      },
+      {
+        "id": 13088,
+        "logprob": -0.003479004,
+        "special": false,
+        "text": " chicken"
+      },
+      {
+        "id": 349,
+        "logprob": -0.0035095215,
+        "special": false,
+        "text": " is"
+      },
+      {
+        "id": 6398,
+        "logprob": -0.3088379,
+        "special": false,
+        "text": " sitting"
+      },
+      {
+        "id": 356,
+        "logprob": -0.027755737,
+        "special": false,
+        "text": " on"
+      },
+      {
+        "id": 264,
+        "logprob": -0.31884766,
+        "special": false,
+        "text": " a"
+      },
+      {
+        "id": 17972,
+        "logprob": -0.047943115,
+        "special": false,
+        "text": " pile"
+      },
+      {
+        "id": 302,
+        "logprob": -0.0002925396,
+        "special": false,
+        "text": " of"
+      },
+      {
+        "id": 2445,
+        "logprob": -0.02935791,
+        "special": false,
+        "text": " money"
+      },
+      {
+        "id": 28723,
+        "logprob": -0.031219482,
+        "special": false,
+        "text": "."
+      },
+      {
+        "id": 32002,
+        "logprob": -0.00034475327,
+        "special": true,
+        "text": "<end_of_utterance>"
+      },
+      {
+        "id": 2,
+        "logprob": -1.1920929e-07,
+        "special": true,
+        "text": "</s>"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": " The cow is standing on the beach and the chicken is sitting on a pile of money."
+}
diff --git a/integration-tests/models/test_flash_pali_gemma.py b/integration-tests/models/test_flash_pali_gemma.py
index d4e83c9f..6be1750c 100644
--- a/integration-tests/models/test_flash_pali_gemma.py
+++ b/integration-tests/models/test_flash_pali_gemma.py
@@ -22,6 +22,12 @@ async def flash_pali_gemma(flash_pali_gemma_handle):
     return flash_pali_gemma_handle.client
 
 
+def get_chicken():
+    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
+
+
 def get_cow_beach():
     with open("integration-tests/images/cow_beach.png", "rb") as image_file:
         encoded_string = base64.b64encode(image_file.read())
@@ -37,3 +43,20 @@ async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
 
     assert response.generated_text == "beach"
     assert response == response_snapshot
+
+
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot):
+    chicken = get_chicken()
+    cow_beach = get_cow_beach()
+    response = await flash_pali_gemma.generate(
+        f"caption![]({chicken})![]({cow_beach})\n",
+        max_new_tokens=20,
+    )
+    # Is PaliGemma not able to handle two separate images? At least we
+    # get output showing that both images are used.
+    assert (
+        response.generated_text == "image result for chicken on the beach"
+    ), f"{repr(response.generated_text)}"
+    assert response == response_snapshot
diff --git a/integration-tests/models/test_idefics.py b/integration-tests/models/test_idefics.py
index aeeaffa1..ac807b76 100644
--- a/integration-tests/models/test_idefics.py
+++ b/integration-tests/models/test_idefics.py
@@ -23,6 +23,12 @@ def get_chicken():
     return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
 
 
+def get_cow_beach():
+    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
+        encoded_string = base64.b64encode(image_file.read())
+    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
+
+
 @pytest.mark.asyncio
 async def test_idefics(idefics, response_snapshot):
     chicken = get_chicken()
@@ -39,6 +45,21 @@ async def test_idefics(idefics, response_snapshot):
     assert response == response_snapshot
 
 
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_idefics_two_images(idefics, response_snapshot):
+    chicken = get_chicken()
+    cow_beach = get_cow_beach()
+    response = await idefics.generate(
+        f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken?<end_of_utterance> \nAssistant:",
+        max_new_tokens=20,
+    )
+    assert (
+        response.generated_text == " The cow and chicken are on a beach."
+ ), f"{repr(response.generated_text)}" + assert response == response_snapshot + + @pytest.mark.asyncio async def test_idefics_load(idefics, generate_load, response_snapshot): chicken = get_chicken() diff --git a/integration-tests/models/test_idefics2.py b/integration-tests/models/test_idefics2.py index d34cce34..9aaf6d8a 100644 --- a/integration-tests/models/test_idefics2.py +++ b/integration-tests/models/test_idefics2.py @@ -9,6 +9,12 @@ def get_chicken(): return f"data:image/png;base64,{encoded_string.decode('utf-8')}" +def get_cow_beach(): + with open("integration-tests/images/cow_beach.png", "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" + + @pytest.fixture(scope="module") def flash_idefics2_next_handle(launcher): with launcher( @@ -38,6 +44,23 @@ async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot assert response == response_snapshot +@pytest.mark.asyncio +@pytest.mark.private +async def test_flash_idefics2_two_images(flash_idefics2_next, response_snapshot): + chicken = get_chicken() + cow_beach = get_cow_beach() + response = await flash_idefics2_next.generate( + f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken? \nAssistant:", + max_new_tokens=20, + ) + assert ( + response.generated_text + == " The cow is standing on the beach and the chicken is sitting on a pile of money." + ), f"{repr(response.generated_text)}" + assert response.details.generated_tokens == 20 + assert response == response_snapshot + + @pytest.mark.asyncio @pytest.mark.private async def test_flash_idefics2_next_all_params(flash_idefics2_next, response_snapshot): diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 59a6fab1..8b5819d1 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -53,7 +53,9 @@ def image_text_replacement(image_input, config, image_id) -> str: num_features = get_number_of_features(height, width, config) from loguru import logger - logger.info(f"Found {num_features} in image of resolution {height}x{width}") + logger.info( + f"Found {num_features} features in image of resolution {height}x{width}" + ) return "" * num_features elif config.model_type == "paligemma": @@ -133,23 +135,41 @@ class VlmCausalLMBatch(FlashCausalLMBatch): def batch_tokenized_inputs( cls, requests: Iterable[generate_pb2.Request], tokenizer, processor, config ): + # Process images first. We need all of them so that the processor + # can make the image splits the same size. And we need the final + # sizes to insert correct number of image tokens. 
+        images = []
+        for r in requests:
+            for chunk in r.input_chunks.chunks:
+                chunk_type = chunk.WhichOneof("chunk")
+                if chunk_type == "text":
+                    pass
+                elif chunk_type == "image":
+                    image = Image.open(BytesIO(chunk.image.data))
+                    if config.model_type == "llava_next":
+                        images.append(image)
+                    else:
+                        images.append([image])
+                else:
+                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
+
+        if images:
+            image_inputs = processor.image_processor(images, return_tensors="pt")
+        else:
+            image_inputs = None
+
         batch_inputs = []
-        image_inputs = []
         max_truncation = 0
+        image_id = 0
         for r in requests:
             full_text = ""
-            image_id = 0
             for chunk in r.input_chunks.chunks:
                 chunk_type = chunk.WhichOneof("chunk")
                 if chunk_type == "text":
                     full_text += chunk.text
                 elif chunk_type == "image":
-                    image = Image.open(BytesIO(chunk.image.data))
-                    image_input = processor.image_processor(image, return_tensors="pt")
-                    full_text += image_text_replacement(image_input, config, image_id)
-                    image_inputs.append(image_input)
-                else:
-                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
+                    full_text += image_text_replacement(image_inputs, config, image_id)
+                    image_id += 1
 
             batch_inputs.append(full_text)
             max_truncation = max(max_truncation, r.truncate)
@@ -160,24 +180,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
             max_length=max_truncation,
             add_special_tokens=not config.model_type == "paligemma",
         )["input_ids"]
-        if image_inputs:
-            image_input = image_inputs[0]
-            new_image_inputs = {
-                "pixel_values": torch.cat(
-                    [img["pixel_values"] for img in image_inputs], dim=0
-                ),
-            }
-            if "pixel_attention_mask" in image_input:
-                new_image_inputs["pixel_attention_mask"] = torch.cat(
-                    [img["pixel_attention_mask"] for img in image_inputs], dim=0
-                )
-            if "image_sizes" in image_input:
-                new_image_inputs["image_sizes"] = torch.cat(
-                    [img["image_sizes"] for img in image_inputs], dim=0
-                )
-            image_inputs = new_image_inputs
-        else:
-            image_inputs = None
+
         return batch_tokenized_inputs, image_inputs
 
     @classmethod
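The `vlm_causal_lm.py` change above replaces per-request image preprocessing with one batched pass over all images, followed by a second pass that splices placeholder tokens into the prompt text. Below is a minimal sketch of that two-pass flow, assuming hypothetical `Chunk`, `image_processor`, and `replacement_for` stand-ins for the real protobuf chunks, HF processor, and `image_text_replacement`; it is an illustration, not the server implementation:

```python
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple


@dataclass
class Chunk:
    kind: str            # "text" or "image" (hypothetical stand-in for the protobuf chunk)
    text: str = ""
    image: object = None


def two_pass_inputs(
    requests: List[List[Chunk]],
    image_processor: Callable[[list], object],
    replacement_for: Callable[[object, int], str],
) -> Tuple[List[str], Optional[object]]:
    # Pass 1: gather every image in the batch so the processor can pick
    # consistent split sizes and report the final per-image feature counts.
    images = [c.image for r in requests for c in r if c.kind == "image"]
    image_inputs = image_processor(images) if images else None

    # Pass 2: rebuild each prompt, replacing every image chunk with the
    # placeholder tokens derived from the processed batch.
    texts = []
    image_id = 0
    for r in requests:
        full_text = ""
        for c in r:
            if c.kind == "text":
                full_text += c.text
            else:
                full_text += replacement_for(image_inputs, image_id)
                image_id += 1
        texts.append(full_text)
    return texts, image_inputs
```

Processing all images in one call is what lets LLaVA-NeXT-style processors keep image splits the same size across the batch, and it makes the old tensor re-concatenation step unnecessary because the processor already returns batched `pixel_values`.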