Merge branch 'main' into lora-internal

2024-06-18 09:50:41 -04:00 · 2024-06-18 09:50:41 -04:00 · 224455f389
parent 1104885f00 11ea9ce002
commit 224455f389
20 changed files with 667 additions and 52 deletions
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@ -51,16 +51,19 @@ jobs:
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
-      - name: Initialize Docker Buildx
+
        uses: docker/setup-buildx-action@v2.0.0
        with:
          install: true
      - name: Inject slug/short variables
        uses: rlespinasse/github-slug-action@v4.4.1
      - name: Tailscale
        uses: huggingface/tailscale-action@main
        with:
          authkey: ${{ secrets.TAILSCALE_AUTHKEY }}
          slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
      - name: Initialize Docker Buildx
        uses: docker/setup-buildx-action@v2.0.0
        with:
          install: true
      - name: Login to GitHub Container Registry
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v2
@ -121,6 +124,7 @@ jobs:
            DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }}${{ matrix.label }}
          tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }}
          network: host
          cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache${{ matrix.label }},mode=min
          cache-to: type=registry,ref=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:cache${{ matrix.label }},mode=min
      - name: Set up Python
@ -139,3 +143,8 @@ jobs:
          export DOCKER_IMAGE=registry.internal.huggingface.tech/api-inference/community/text-generation-inference:sha-${{ env.GITHUB_SHA_SHORT }}
          export HUGGING_FACE_HUB_TOKEN=${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          pytest -s -vv integration-tests
      - name: Tailscale Wait
        if: ${{ failure() || runner.debug == '1' }}
        uses: huggingface/tailscale-action@main
        with:
          waitForSSH: true
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@ -33,9 +33,9 @@ jobs:
      - name: Install Rust
        uses: actions-rs/toolchain@v1
        with:
-          # Released on: 02 May, 2024
+          # Released on: June 13, 2024
-          # https://releases.rs/docs/1.78.0/
+          # https://releases.rs/docs/1.79.0/
-          toolchain: 1.78.0
+          toolchain: 1.79.0
          override: true
          components: rustfmt, clippy
      - name: Install Protoc
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@ -0,0 +1,133 @@
 # Contributor Covenant Code of Conduct
 ## Our Pledge
 We as members, contributors, and leaders pledge to make participation in our
 community a harassment-free experience for everyone, regardless of age, body
 size, visible or invisible disability, ethnicity, sex characteristics, gender
 identity and expression, level of experience, education, socio-economic status,
 nationality, personal appearance, race, caste, color, religion, or sexual
 identity and orientation.
 We pledge to act and interact in ways that contribute to an open, welcoming,
 diverse, inclusive, and healthy community.
 ## Our Standards
 Examples of behavior that contributes to a positive environment for our
 community include:
 * Demonstrating empathy and kindness toward other people
 * Being respectful of differing opinions, viewpoints, and experiences
 * Giving and gracefully accepting constructive feedback
 * Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
 * Focusing on what is best not just for us as individuals, but for the overall
  community
 Examples of unacceptable behavior include:
 * The use of sexualized language or imagery, and sexual attention or advances of
  any kind
 * Trolling, insulting or derogatory comments, and personal or political attacks
 * Public or private harassment
 * Publishing others' private information, such as a physical or email address,
  without their explicit permission
 * Other conduct which could reasonably be considered inappropriate in a
  professional setting
 ## Enforcement Responsibilities
 Community leaders are responsible for clarifying and enforcing our standards of
 acceptable behavior and will take appropriate and fair corrective action in
 response to any behavior that they deem inappropriate, threatening, offensive,
 or harmful.
 Community leaders have the right and responsibility to remove, edit, or reject
 comments, commits, code, wiki edits, issues, and other contributions that are
 not aligned to this Code of Conduct, and will communicate reasons for moderation
 decisions when appropriate.
 ## Scope
 This Code of Conduct applies within all community spaces, and also applies when
 an individual is officially representing the community in public spaces.
 Examples of representing our community include using an official e-mail address,
 posting via an official social media account, or acting as an appointed
 representative at an online or offline event.
 ## Enforcement
 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported to the community leaders responsible for enforcement at
 feedback@huggingface.co.
 All complaints will be reviewed and investigated promptly and fairly.
 All community leaders are obligated to respect the privacy and security of the
 reporter of any incident.
 ## Enforcement Guidelines
 Community leaders will follow these Community Impact Guidelines in determining
 the consequences for any action they deem in violation of this Code of Conduct:
 ### 1. Correction
 **Community Impact**: Use of inappropriate language or other behavior deemed
 unprofessional or unwelcome in the community.
 **Consequence**: A private, written warning from community leaders, providing
 clarity around the nature of the violation and an explanation of why the
 behavior was inappropriate. A public apology may be requested.
 ### 2. Warning
 **Community Impact**: A violation through a single incident or series of
 actions.
 **Consequence**: A warning with consequences for continued behavior. No
 interaction with the people involved, including unsolicited interaction with
 those enforcing the Code of Conduct, for a specified period of time. This
 includes avoiding interactions in community spaces as well as external channels
 like social media. Violating these terms may lead to a temporary or permanent
 ban.
 ### 3. Temporary Ban
 **Community Impact**: A serious violation of community standards, including
 sustained inappropriate behavior.
 **Consequence**: A temporary ban from any sort of interaction or public
 communication with the community for a specified period of time. No public or
 private interaction with the people involved, including unsolicited interaction
 with those enforcing the Code of Conduct, is allowed during this period.
 Violating these terms may lead to a permanent ban.
 ### 4. Permanent Ban
 **Community Impact**: Demonstrating a pattern of violation of community
 standards, including sustained inappropriate behavior, harassment of an
 individual, or aggression toward or disparagement of classes of individuals.
 **Consequence**: A permanent ban from any sort of public interaction within the
 community.
 ## Attribution
 This Code of Conduct is adapted from the [Contributor Covenant][homepage],
 version 2.1, available at
 [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
 Community Impact Guidelines were inspired by
 [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
 For answers to common questions about this code of conduct, see the FAQ at
 [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
 [https://www.contributor-covenant.org/translations][translations].
 [homepage]: https://www.contributor-covenant.org
 [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
 [Mozilla CoC]: https://github.com/mozilla/diversity
 [FAQ]: https://www.contributor-covenant.org/faq
 [translations]: https://www.contributor-covenant.org/translations
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -0,0 +1,120 @@
 <!---
 Copyright 2024 The HuggingFace Team. All rights reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
 # Contribute to text-generation-inference
 Everyone is welcome to contribute, and we value everybody's contribution. Code
 contributions are not the only way to help the community. Answering questions, helping
 others, and improving the documentation are also immensely valuable.
 It also helps us if you spread the word! Reference the library in blog posts
 about the awesome projects it made possible, shout out on Twitter every time it has
 helped you, or simply ⭐️ the repository to say thank you.
 However you choose to contribute, please be mindful and respect our
 [code of conduct](https://github.com/huggingface/text-generation-inference/blob/main/CODE_OF_CONDUCT.md).
 **This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
 ## Ways to contribute
 There are several ways you can contribute to text-generation-inference.
 * Fix outstanding issues with the existing code.
 * Submit issues related to bugs or desired new features.
 * Contribute to the examples or to the documentation.
 > All contributions are equally valuable to the community. 🥰
 ## Fixing outstanding issues
 If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) and open
 a Pull Request!
 ## Submitting a bug-related issue or feature request
 Do your best to follow these guidelines when submitting a bug-related issue or a feature
 request. It will make it easier for us to come back to you quickly and with good
 feedback.
 ### Did you find a bug?
 The text-generation-inference library is robust and reliable thanks to users who report the problems they encounter.
 Before you report an issue, we would really appreciate it if you could **make sure the bug was not
 already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the
 library itself, and not your code.
 Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so
 we can quickly resolve it:
 * Your **OS type and version**, as well as your environment versions (versions of rust, python, and dependencies).
 * A short, self-contained code snippet that allows us to reproduce the bug.
 * The *full* traceback if an exception is raised.
 * Attach any other additional information, like screenshots, you think may help.
 To get the OS and software versions automatically, you can re-run the launcher with the `--env` flag:
 ```bash
 text-generation-launcher --env
 ```
 This will output the information relevant to your environment before the model is launched. We recommend pasting
 that into your issue report.
 ### Do you want a new feature?
 If there is a new feature you'd like to see in text-generation-inference, please open an issue and describe:
 1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it
   a feature related to something you need for a project? Is it something you worked on and think could benefit
   the community?
   Whatever it is, we'd love to hear about it!
 2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better
   we'll be able to help you.
 3. Provide a *code snippet* that demonstrates the feature's usage.
 4. If the feature is related to a paper, please include a link.
 If your issue is well written we're already 80% of the way there by the time you create it.
 We have added [templates](https://github.com/huggingface/text-generation-inference/tree/main/.github/ISSUE_TEMPLATE)
 to help you get started with your issue.
 ## Do you want to implement a new model?
 New models are constantly released and if you want to implement a new model, please provide the following information:
 * A short description of the model and a link to the paper.
 * Link to the implementation if it is open-sourced.
 * Link to the model weights if they are available.
 If you are willing to contribute the model yourself, let us know so we can help you add it to text-generation-inference!
 ## Do you want to add documentation?
 We're always looking for improvements to the documentation that make it clearer and more accurate. Please let us know
 how the documentation can be improved, such as by fixing typos or any content that is missing, unclear or inaccurate. We'll be
 happy to make the changes or help you make a contribution if you're interested!
 ## I want to become a maintainer of the project. How do I get there?
 TGI is a project led and managed by Hugging Face as it powers our internal services. However, we are happy to have
 motivated individuals from other organizations join us as maintainers with the goal of making TGI the best inference
 service.
 If you are such an individual (or organization), please reach out to us and let's collaborate.
--- a/2
+++ b/2
@ -1,5 +1,5 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
 WORKDIR /usr/src
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
--- a/2
+++ b/2
@ -1,5 +1,5 @@
 # Rust builder
-FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
 WORKDIR /usr/src
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
--- a/2
+++ b/2
@ -1,4 +1,4 @@
-FROM lukemathwalker/cargo-chef:latest-rust-1.78 AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef
 WORKDIR /usr/src
 ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse
--- a/benchmark/src/app.rs
+++ b/benchmark/src/app.rs
@ -497,7 +497,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
                "Lowest:  {:.2} {unit}",
                data.iter()
                    .min_by(|a, b| a.total_cmp(b))
-                    .unwrap_or(&std::f64::NAN)
+                    .unwrap_or(&f64::NAN)
            ),
            Style::default().fg(Color::Reset),
        )]),
@ -506,7 +506,7 @@ fn statis_spans<'a>(data: &[f64], unit: &'static str) -> Vec<Line<'a>> {
                "Highest: {:.2} {unit}",
                data.iter()
                    .max_by(|a, b| a.total_cmp(b))
-                    .unwrap_or(&std::f64::NAN)
+                    .unwrap_or(&f64::NAN)
            ),
            Style::default().fg(Color::Reset),
        )]),
@ -555,17 +555,17 @@ fn latency_throughput_chart<'a>(
    let min_latency: f64 = *latency_iter
        .clone()
        .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    let max_latency: f64 = *latency_iter
        .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    let min_throughput: f64 = *throughput_iter
        .clone()
        .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    let max_throughput: f64 = *throughput_iter
        .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    // Char min max values
    let min_x = if zoom {
--- a/benchmark/src/table.rs
+++ b/benchmark/src/table.rs
@ -156,17 +156,17 @@ fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
    let min = data
        .iter()
        .min_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    let max = data
        .iter()
        .max_by(|a, b| a.total_cmp(b))
-        .unwrap_or(&std::f64::NAN);
+        .unwrap_or(&f64::NAN);
    (average, *min, *max)
 }
 fn px(data: &[f64], p: u32) -> f64 {
    let i = (f64::from(p) / 100.0 * data.len() as f64) as usize;
-    *data.get(i).unwrap_or(&std::f64::NAN)
+    *data.get(i).unwrap_or(&f64::NAN)
 }
 fn format_value(value: f64, unit: &'static str) -> String {
--- a/benchmark/src/utils.rs
+++ b/benchmark/src/utils.rs
@ -37,7 +37,7 @@ pub(crate) fn percentiles(values: &[f64], pecents: &[i32]) -> BTreeMap<String, f
        .iter()
        .map(|&p| {
            let i = (f64::from(p) / 100.0 * values.len() as f64) as usize;
-            (format!("p{p}"), *values.get(i).unwrap_or(&std::f64::NAN))
+            (format!("p{p}"), *values.get(i).unwrap_or(&f64::NAN))
        })
        .collect()
 }
--- a/integration-tests/models/snapshots/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json
+++ b/integration-tests/models/snapshots/test_flash_pali_gemma/test_flash_pali_gemma_two_images.json
@ -0,0 +1,61 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 8,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 2502,
        "logprob": -1.734375,
        "special": false,
        "text": "image"
      },
      {
        "id": 2196,
        "logprob": -0.5756836,
        "special": false,
        "text": " result"
      },
      {
        "id": 604,
        "logprob": -0.007843018,
        "special": false,
        "text": " for"
      },
      {
        "id": 12254,
        "logprob": -1.7167969,
        "special": false,
        "text": " chicken"
      },
      {
        "id": 611,
        "logprob": -0.17053223,
        "special": false,
        "text": " on"
      },
      {
        "id": 573,
        "logprob": -0.7626953,
        "special": false,
        "text": " the"
      },
      {
        "id": 8318,
        "logprob": -0.02709961,
        "special": false,
        "text": " beach"
      },
      {
        "id": 1,
        "logprob": -0.20739746,
        "special": true,
        "text": "<eos>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": "image result for chicken on the beach"
 }
--- a/integration-tests/models/snapshots/test_idefics/test_idefics_two_images.json
+++ b/integration-tests/models/snapshots/test_idefics/test_idefics_two_images.json
@ -0,0 +1,85 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "eos_token",
    "generated_tokens": 12,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 450,
        "logprob": -0.26342773,
        "special": false,
        "text": " The"
      },
      {
        "id": 21282,
        "logprob": -0.01838684,
        "special": false,
        "text": " cow"
      },
      {
        "id": 322,
        "logprob": -0.18041992,
        "special": false,
        "text": " and"
      },
      {
        "id": 521,
        "logprob": -0.62841797,
        "special": false,
        "text": " ch"
      },
      {
        "id": 21475,
        "logprob": -0.0037956238,
        "special": false,
        "text": "icken"
      },
      {
        "id": 526,
        "logprob": -0.018737793,
        "special": false,
        "text": " are"
      },
      {
        "id": 373,
        "logprob": -1.0820312,
        "special": false,
        "text": " on"
      },
      {
        "id": 263,
        "logprob": -0.5083008,
        "special": false,
        "text": " a"
      },
      {
        "id": 25695,
        "logprob": -0.07128906,
        "special": false,
        "text": " beach"
      },
      {
        "id": 29889,
        "logprob": -0.12573242,
        "special": false,
        "text": "."
      },
      {
        "id": 32002,
        "logprob": -0.0029792786,
        "special": true,
        "text": "<end_of_utterance>"
      },
      {
        "id": 2,
        "logprob": -0.00024962425,
        "special": true,
        "text": "</s>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " The cow and chicken are on a beach."
 }
--- a/integration-tests/models/snapshots/test_idefics2/test_flash_idefics2_two_images.json
+++ b/integration-tests/models/snapshots/test_idefics2/test_flash_idefics2_two_images.json
@ -0,0 +1,133 @@
 {
  "details": {
    "best_of_sequences": null,
    "finish_reason": "length",
    "generated_tokens": 20,
    "prefill": [],
    "seed": null,
    "tokens": [
      {
        "id": 415,
        "logprob": -0.04421997,
        "special": false,
        "text": " The"
      },
      {
        "id": 12072,
        "logprob": -0.13500977,
        "special": false,
        "text": " cow"
      },
      {
        "id": 349,
        "logprob": -0.06750488,
        "special": false,
        "text": " is"
      },
      {
        "id": 6328,
        "logprob": -0.6352539,
        "special": false,
        "text": " standing"
      },
      {
        "id": 356,
        "logprob": -0.16186523,
        "special": false,
        "text": " on"
      },
      {
        "id": 272,
        "logprob": -0.5078125,
        "special": false,
        "text": " the"
      },
      {
        "id": 10305,
        "logprob": -0.017913818,
        "special": false,
        "text": " beach"
      },
      {
        "id": 304,
        "logprob": -1.5205078,
        "special": false,
        "text": " and"
      },
      {
        "id": 272,
        "logprob": -0.029174805,
        "special": false,
        "text": " the"
      },
      {
        "id": 13088,
        "logprob": -0.003479004,
        "special": false,
        "text": " chicken"
      },
      {
        "id": 349,
        "logprob": -0.0035095215,
        "special": false,
        "text": " is"
      },
      {
        "id": 6398,
        "logprob": -0.3088379,
        "special": false,
        "text": " sitting"
      },
      {
        "id": 356,
        "logprob": -0.027755737,
        "special": false,
        "text": " on"
      },
      {
        "id": 264,
        "logprob": -0.31884766,
        "special": false,
        "text": " a"
      },
      {
        "id": 17972,
        "logprob": -0.047943115,
        "special": false,
        "text": " pile"
      },
      {
        "id": 302,
        "logprob": -0.0002925396,
        "special": false,
        "text": " of"
      },
      {
        "id": 2445,
        "logprob": -0.02935791,
        "special": false,
        "text": " money"
      },
      {
        "id": 28723,
        "logprob": -0.031219482,
        "special": false,
        "text": "."
      },
      {
        "id": 32002,
        "logprob": -0.00034475327,
        "special": true,
        "text": "<end_of_utterance>"
      },
      {
        "id": 2,
        "logprob": -1.1920929e-07,
        "special": true,
        "text": "</s>"
      }
    ],
    "top_tokens": null
  },
  "generated_text": " The cow is standing on the beach and the chicken is sitting on a pile of money."
 }
--- a/integration-tests/models/test_flash_pali_gemma.py
+++ b/integration-tests/models/test_flash_pali_gemma.py
@ -22,6 +22,12 @@ async def flash_pali_gemma(flash_pali_gemma_handle):
    return flash_pali_gemma_handle.client
 def get_chicken():
    with open("integration-tests/images/chicken_on_money.png", "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
 def get_cow_beach():
    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
@ -37,3 +43,20 @@ async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot):
    assert response.generated_text == "beach"
    assert response == response_snapshot
@pytest.mark.asyncio
@pytest.mark.private
 async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot):
    chicken = get_chicken()
    cow_beach = get_cow_beach()
    response = await flash_pali_gemma.generate(
        f"caption![]({chicken})![]({cow_beach})\n",
        max_new_tokens=20,
    )
    # Is PaliGemma not able to handle two separate images? At least we
    # get output showing that both images are used.
    assert (
        response.generated_text == "image result for chicken on the beach"
    ), f"{repr(response.generated_text)}"
    assert response == response_snapshot
--- a/integration-tests/models/test_idefics.py
+++ b/integration-tests/models/test_idefics.py
@ -23,6 +23,12 @@ def get_chicken():
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
 def get_cow_beach():
    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
@pytest.mark.asyncio
 async def test_idefics(idefics, response_snapshot):
    chicken = get_chicken()
@ -39,6 +45,21 @@ async def test_idefics(idefics, response_snapshot):
    assert response == response_snapshot
@pytest.mark.asyncio
@pytest.mark.private
 async def test_idefics_two_images(idefics, response_snapshot):
    chicken = get_chicken()
    cow_beach = get_cow_beach()
    response = await idefics.generate(
        f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken?<end_of_utterance> \nAssistant:",
        max_new_tokens=20,
    )
    assert (
        response.generated_text == " The cow and chicken are on a beach."
    ), f"{repr(response.generated_text)}"
    assert response == response_snapshot
@pytest.mark.asyncio
 async def test_idefics_load(idefics, generate_load, response_snapshot):
    chicken = get_chicken()
--- a/integration-tests/models/test_idefics2.py
+++ b/integration-tests/models/test_idefics2.py
@ -9,6 +9,12 @@ def get_chicken():
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
 def get_cow_beach():
    with open("integration-tests/images/cow_beach.png", "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
    return f"data:image/png;base64,{encoded_string.decode('utf-8')}"
@pytest.fixture(scope="module")
 def flash_idefics2_next_handle(launcher):
    with launcher(
@ -38,6 +44,23 @@ async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot
    assert response == response_snapshot
@pytest.mark.asyncio
@pytest.mark.private
 async def test_flash_idefics2_two_images(flash_idefics2_next, response_snapshot):
    chicken = get_chicken()
    cow_beach = get_cow_beach()
    response = await flash_idefics2_next.generate(
        f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken?<end_of_utterance> \nAssistant:",
        max_new_tokens=20,
    )
    assert (
        response.generated_text
        == " The cow is standing on the beach and the chicken is sitting on a pile of money."
    ), f"{repr(response.generated_text)}"
    assert response.details.generated_tokens == 20
    assert response == response_snapshot
@pytest.mark.asyncio
@pytest.mark.private
 async def test_flash_idefics2_next_all_params(flash_idefics2_next, response_snapshot):
--- a/router/client/build.rs
+++ b/router/client/build.rs
@ -1,7 +1,7 @@
 use std::fs;
 fn main() -> Result<(), Box<dyn std::error::Error>> {
-    println!("cargo:rerun-if-changed=../../proto/**");
+    println!("cargo:rerun-if-changed=../../proto/");
    fs::create_dir_all("src/v2/pb").unwrap_or(());
    let mut config = prost_build::Config::new();
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@ -1,5 +1,5 @@
 [toolchain]
-# Released on: 02 May, 2024
+# Released on: June 13, 2024
-# https://releases.rs/docs/1.78.0/
+# https://releases.rs/docs/1.79.0/
-channel = "1.78.0"
+channel = "1.79.0"
 components = ["rustfmt", "clippy"]
--- a/server/text_generation_server/models/vlm_causal_lm.py
+++ b/server/text_generation_server/models/vlm_causal_lm.py
@ -53,7 +53,9 @@ def image_text_replacement(image_input, config, image_id) -> str:
        num_features = get_number_of_features(height, width, config)
        from loguru import logger
-        logger.info(f"Found {num_features} in image of resolution {height}x{width}")
+        logger.info(
            f"Found {num_features} features in image of resolution {height}x{width}"
        )
        return "<image>" * num_features
    elif config.model_type == "paligemma":
@ -133,23 +135,41 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
    def batch_tokenized_inputs(
        cls, requests: Iterable[generate_pb2.Request], tokenizer, processor, config
    ):
        # Process images first. We need all of them so that the processor
        # can make the image splits the same size. And we need the final
        # sizes to insert correct number of image tokens.
        images = []
        for r in requests:
            for chunk in r.input_chunks.chunks:
                chunk_type = chunk.WhichOneof("chunk")
                if chunk_type == "text":
                    pass
                elif chunk_type == "image":
                    image = Image.open(BytesIO(chunk.image.data))
                    if config.model_type == "llava_next":
                        images.append(image)
                    else:
                        images.append([image])
                else:
                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
        if images:
            image_inputs = processor.image_processor(images, return_tensors="pt")
        else:
            image_inputs = None
        batch_inputs = []
        image_inputs = []
        max_truncation = 0
        image_id = 0
        for r in requests:
            full_text = ""
            image_id = 0
            for chunk in r.input_chunks.chunks:
                chunk_type = chunk.WhichOneof("chunk")
                if chunk_type == "text":
                    full_text += chunk.text
                elif chunk_type == "image":
-                    image = Image.open(BytesIO(chunk.image.data))
+                    full_text += image_text_replacement(image_inputs, config, image_id)
-                    image_input = processor.image_processor(image, return_tensors="pt")
+                    image_id += 1
                    full_text += image_text_replacement(image_input, config, image_id)
                    image_inputs.append(image_input)
                else:
                    raise RuntimeError(f"Invalid chunk type {chunk_type}")
            batch_inputs.append(full_text)
            max_truncation = max(max_truncation, r.truncate)
@ -160,24 +180,7 @@ class VlmCausalLMBatch(FlashCausalLMBatch):
            max_length=max_truncation,
            add_special_tokens=not config.model_type == "paligemma",
        )["input_ids"]
-        if image_inputs:
+
            image_input = image_inputs[0]
            new_image_inputs = {
                "pixel_values": torch.cat(
                    [img["pixel_values"] for img in image_inputs], dim=0
                ),
            }
            if "pixel_attention_mask" in image_input:
                new_image_inputs["pixel_attention_mask"] = torch.cat(
                    [img["pixel_attention_mask"] for img in image_inputs], dim=0
                )
            if "image_sizes" in image_input:
                new_image_inputs["image_sizes"] = torch.cat(
                    [img["image_sizes"] for img in image_inputs], dim=0
                )
            image_inputs = new_image_inputs
        else:
            image_inputs = None
        return batch_tokenized_inputs, image_inputs
    @classmethod
--- a/server/text_generation_server/server.py
+++ b/server/text_generation_server/server.py
@ -270,7 +270,11 @@ def serve(
            interceptors=[
                ExceptionInterceptor(),
                UDSOpenTelemetryAioServerInterceptor(),
-            ]
+            ],
            options=[
                # Set the maximum possible message length: i32::MAX
                ("grpc.max_receive_message_length", (1 << 31) - 1)
            ],
        )
        generate_pb2_grpc.add_TextGenerationServiceServicer_to_server(
            TextGenerationService(model, Cache(), quantize, server_urls), server