From dae3bf1d87c13f1186eead4afbfb4a29e4a59f84 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Wed, 11 Sep 2024 22:41:56 +0200
Subject: [PATCH] Fix tokenization yi (#2507)

* Fixing odd tokenization self modifications on the Rust side (load and
resave in Python).

* Fixing the builds ?

* Fix the gh action?

* Fixing the location ?

* Validation is odd.

* Try a faster runner

* Upgrade python version.

* Remove sccache

* No sccache.

* Getting libpython maybe ?

* List stuff.

* Monkey it up.

* have no idea at this point

* Tmp.

* Shot in the dark.

* Tmate the hell out of this.

* Desperation.

* WTF.

* -y.

* Apparently 3.10 is not available anymore.

* Updating the dockerfile to make libpython discoverable at runtime too.

* Put back rust tests.

* Why do we want mkl on AMD ?

* Forcing 3.11 ?
---
 .github/workflows/tests.yaml                  |  40 +-----
 Cargo.lock                                    | 120 +++++++++++++++++-
 Cargo.toml                                    |   2 +-
 Dockerfile                                    |  40 +++---
 Dockerfile_amd                                |  27 ++--
 Dockerfile_intel                              |   5 +-
 backends/v3/src/queue.rs                      |   1 +
 router/Cargo.toml                             |   1 +
 router/src/server.rs                          | 113 +++++------------
 server/tests/models/test_bloom.py             |   2 +-
 server/tests/models/test_causal_lm.py         |   2 +-
 server/tests/models/test_seq2seq_lm.py        |   2 +-
 .../models/flash_causal_lm.py                 |   2 +
 13 files changed, 205 insertions(+), 152 deletions(-)

diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 6faabe3b..5ad0fd6a 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -17,19 +17,15 @@ concurrency:
 
 jobs:
   run_tests:
-    runs-on: ubuntu-latest
-
-    env:
-      SCCACHE_GHA_ENABLED: "on"
-      RUSTC_WRAPPER: /usr/local/bin/sccache
-      SCCACHE: 0.3.3
-
+    runs-on:
+      group: aws-highmemory-32-plus-priv
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v4
+        id: python
         with:
-          python-version: 3.9
+          python-version: 3.11
       - name: Install Rust
         uses: actions-rs/toolchain@v1
         with:
@@ -44,30 +40,9 @@ jobs:
         run: |
           sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
           sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
-      - name: Install sccache
-        run: |
-          curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache
-          chmod +x /usr/local/bin/sccache
-      - name: configure sccache
-        uses: actions/github-script@v6
-        with:
-          script: |
-            core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
-            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
-            core.exportVariable('SCCACHE_GHA_CACHE_TO', 'sccache-${{runner.os}}-${{github.ref_name}}');
-            core.exportVariable('SCCACHE_GHA_CACHE_FROM', 'sccache-${{runner.os}}-main,sccache-${{runner.os}}-');
-      - name: cargo registry cache
-        uses: actions/cache@v3
-        with:
-          key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-${{ github.sha }}
-          restore-keys: |
-            cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-
-            cargo-${{ runner.os }}-
-          path: |
-            ~/.cargo/registry
-            ~/.cargo/git
       - name: Install
         run: |
+          sudo apt install python3.11-dev -y
           make install-cpu
       - name: Run server tests
         run: |
@@ -82,6 +57,3 @@ jobs:
       - name: Run Rust tests
         run: |
           cargo test
-      - name: sccache stats
-        run: |
-          /usr/local/bin/sccache --show-stats
diff --git a/Cargo.lock b/Cargo.lock
index 00c7f005..06a61853 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2118,6 +2118,15 @@ version = "2.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 
+[[package]]
+name = "memoffset"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "metrics"
 version = "0.23.0"
@@ -3112,6 +3121,69 @@ dependencies = [
  "prost 0.12.6",
 ]
 
+[[package]]
+name = "pyo3"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "831e8e819a138c36e212f3af3fd9eeffed6bf1510a805af35b0edee5ffa59433"
+dependencies = [
+ "cfg-if",
+ "indoc",
+ "libc",
+ "memoffset",
+ "once_cell",
+ "portable-atomic",
+ "pyo3-build-config",
+ "pyo3-ffi",
+ "pyo3-macros",
+ "unindent",
+]
+
+[[package]]
+name = "pyo3-build-config"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e8730e591b14492a8945cdff32f089250b05f5accecf74aeddf9e8272ce1fa8"
+dependencies = [
+ "once_cell",
+ "target-lexicon",
+]
+
+[[package]]
+name = "pyo3-ffi"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e97e919d2df92eb88ca80a037969f44e5e70356559654962cbb3316d00300c6"
+dependencies = [
+ "libc",
+ "pyo3-build-config",
+]
+
+[[package]]
+name = "pyo3-macros"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eb57983022ad41f9e683a599f2fd13c3664d7063a3ac5714cae4b7bee7d3f206"
+dependencies = [
+ "proc-macro2",
+ "pyo3-macros-backend",
+ "quote",
+ "syn 2.0.76",
+]
+
+[[package]]
+name = "pyo3-macros-backend"
+version = "0.22.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec480c0c51ddec81019531705acac51bcdbeae563557c982aa8263bb96880372"
+dependencies = [
+ "heck 0.5.0",
+ "proc-macro2",
+ "pyo3-build-config",
+ "quote",
+ "syn 2.0.76",
+]
+
 [[package]]
 name = "qoi"
 version = "0.4.1"
@@ -4068,7 +4140,7 @@ dependencies = [
  "pkg-config",
  "text-generation-router",
  "thiserror",
- "tokenizers",
+ "tokenizers 0.19.1",
  "tokio",
  "tokio-stream",
  "tracing",
@@ -4091,7 +4163,7 @@ dependencies = [
  "tabled",
  "text-generation-client",
  "thiserror",
- "tokenizers",
+ "tokenizers 0.20.0",
  "tokio",
  "tracing",
  "tracing-subscriber",
@@ -4161,6 +4233,7 @@ dependencies = [
  "once_cell",
  "opentelemetry 0.20.0",
  "opentelemetry-otlp",
+ "pyo3",
  "rand",
  "regex",
  "reqwest",
@@ -4168,7 +4241,7 @@ dependencies = [
  "serde_json",
  "sysinfo",
  "thiserror",
- "tokenizers",
+ "tokenizers 0.20.0",
  "tokio",
  "tokio-stream",
  "tower-http",
@@ -4219,7 +4292,7 @@ dependencies = [
  "slotmap",
  "text-generation-router",
  "thiserror",
- "tokenizers",
+ "tokenizers 0.20.0",
  "tokio",
  "tokio-stream",
  "tonic 0.10.2",
@@ -4374,6 +4447,39 @@ dependencies = [
  "unicode_categories",
 ]
 
+[[package]]
+name = "tokenizers"
+version = "0.20.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8a24d7f7d6be5b9d1377418b893ab1808af0074f5d1bb2c64784452ddd2aa70"
+dependencies = [
+ "aho-corasick",
+ "derive_builder",
+ "esaxx-rs",
+ "getrandom",
+ "hf-hub",
+ "indicatif",
+ "itertools 0.12.1",
+ "lazy_static",
+ "log",
+ "macro_rules_attribute",
+ "monostate",
+ "onig",
+ "paste",
+ "rand",
+ "rayon",
+ "rayon-cond",
+ "regex",
+ "regex-syntax 0.8.4",
+ "serde",
+ "serde_json",
+ "spm_precompiled",
+ "thiserror",
+ "unicode-normalization-alignments",
+ "unicode-segmentation",
+ "unicode_categories",
+]
+
 [[package]]
 name = "tokio"
 version = "1.39.3"
@@ -4839,6 +4945,12 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
 
+[[package]]
+name = "unindent"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
+
 [[package]]
 name = "untrusted"
 version = "0.7.1"
diff --git a/Cargo.toml b/Cargo.toml
index 79fda15d..a50bba24 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,7 +25,7 @@ homepage = "https://github.com/huggingface/text-generation-inference"
 
 [workspace.dependencies]
 base64 = "0.22.0"
-tokenizers = { version = "0.19.1", features = ["http"] }
+tokenizers = { version = "0.20.0", features = ["http"] }
 hf-hub = { version = "0.3.1", features = ["tokio"] }
 metrics = { version = "0.23.0" }
 metrics-exporter-prometheus = { version = "0.15.1", features = [] }
diff --git a/Dockerfile b/Dockerfile
index 0d0e89b1..37ced781 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,10 +13,13 @@ COPY benchmark benchmark
 COPY router router
 COPY backends backends
 COPY launcher launcher
+
 RUN cargo chef prepare --recipe-path recipe.json
 
 FROM chef AS builder
 
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
@@ -37,6 +40,7 @@ COPY router router
 COPY backends backends
 COPY launcher launcher
 RUN cargo build --profile release-opt
+RUN cargo build --profile release-opt
 
 # Python builder
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
@@ -45,7 +49,7 @@ FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS pytorch-install
 # NOTE: When updating PyTorch version, beware to remove `pip install nvidia-nccl-cu12==2.22.3` below in the Dockerfile. Context: https://github.com/huggingface/text-generation-inference/pull/2099
 ARG PYTORCH_VERSION=2.4.0
 
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.11
 # Keep in sync with `server/pyproject.toml
 ARG CUDA_VERSION=12.4
 ARG MAMBA_VERSION=24.3.0-0
@@ -216,33 +220,33 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
 COPY --from=pytorch-install /opt/conda /opt/conda
 
 # Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 
 # Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /opt/conda/lib/python3.10/site-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-v2-builder /opt/conda/lib/python3.11/site-packages/flash_attn_2_cuda.cpython-311-x86_64-linux-gnu.so /opt/conda/lib/python3.11/site-packages
 
 # Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllamav2-kernels-builder /usr/src/exllamav2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from eetq kernels builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from lorax punica kernels builder
-COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=lorax-punica-builder /usr/src/lorax-punica/server/punica_kernels/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from fbgemm builder
-COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.10/cmake-install /opt/conda/lib/python3.10/site-packages
+COPY --from=fbgemm-builder /usr/src/fbgemm/fbgemm_gpu/_skbuild/linux-x86_64-3.11/cmake-install /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 # Copy build artifacts from mamba builder
-COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
-COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-310/ /opt/conda/lib/python3.10/site-packages
-COPY --from=flashinfer-builder /opt/conda/lib/python3.10/site-packages/flashinfer/ /opt/conda/lib/python3.10/site-packages/flashinfer/
+COPY --from=mamba-builder /usr/src/mamba/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
+COPY --from=mamba-builder /usr/src/causal-conv1d/build/lib.linux-x86_64-cpython-311/ /opt/conda/lib/python3.11/site-packages
+COPY --from=flashinfer-builder /opt/conda/lib/python3.11/site-packages/flashinfer/ /opt/conda/lib/python3.11/site-packages/flashinfer/
 
 # Install flash-attention dependencies
 RUN pip install einops --no-cache-dir
@@ -257,7 +261,9 @@ RUN cd server && \
     pip install ".[bnb, accelerate, marlin, quantize, peft, outlines]" --no-cache-dir && \
     pip install nvidia-nccl-cu12==2.22.3
 
-ENV LD_PRELOAD=/opt/conda/lib/python3.10/site-packages/nvidia/nccl/lib/libnccl.so.2
+ENV LD_PRELOAD=/opt/conda/lib/python3.11/site-packages/nvidia/nccl/lib/libnccl.so.2
+# Required to find libpython within the rust binaries
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
 # This is needed because exl2 tries to load flash-attn
 # And fails with our builds.
 ENV EXLLAMA_NO_FLASH_ATTN=1
diff --git a/Dockerfile_amd b/Dockerfile_amd
index 8cb699dd..a79aae48 100644
--- a/Dockerfile_amd
+++ b/Dockerfile_amd
@@ -17,6 +17,8 @@ RUN cargo chef prepare --recipe-path recipe.json
 
 FROM chef AS builder
 
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
@@ -64,14 +66,14 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     hipsolver-dev \
     rccl-dev \
     cmake \
-    python3-dev && \
+    python3.11-dev && \
     rm -rf /var/lib/apt/lists/*
 
 # Keep in sync with `server/pyproject.toml
 ARG MAMBA_VERSION=23.1.0-1
 ARG PYTORCH_VERSION='2.3.0'
 ARG ROCM_VERSION='6.0.2'
-ARG PYTHON_VERSION='3.10.10'
+ARG PYTHON_VERSION='3.11.10'
 # Automatically set by buildx
 ARG TARGETPLATFORM
 ENV PATH /opt/conda/bin:$PATH
@@ -89,10 +91,18 @@ RUN chmod +x ~/mambaforge.sh && \
     mamba init && \
     rm ~/mambaforge.sh
 
+# RUN conda install intel::mkl-static intel::mkl-include
+# Install pytorch
+# On arm64 we exit with an error code
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  exit 1 ;; \
+         *)              /opt/conda/bin/conda update -y conda &&  \
+                         /opt/conda/bin/conda install -y "python=${PYTHON_VERSION}" ;; \
+    esac && \
+    /opt/conda/bin/conda clean -ya
 # Install flash-attention, torch dependencies
 RUN pip install numpy einops ninja --no-cache-dir
 
-RUN conda install intel::mkl-static intel::mkl-include
 RUN pip uninstall -y triton && \
     git clone --depth 1 --single-branch https://github.com/ROCm/triton.git && \
     cd triton/python && \
@@ -172,19 +182,19 @@ ENV HF_HOME=/data \
     PORT=80
 
 # Copy builds artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 
 # Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 
 # Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 
 # Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 
 # Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /opt/conda/lib/python3.11/site-packages
 
 # Install server
 COPY proto proto
@@ -201,6 +211,7 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-benchmark /usr/l
 COPY --from=builder /usr/src/target/release-opt/text-generation-router /usr/local/bin/text-generation-router
 # Install launcher
 COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/local/bin/text-generation-launcher
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
 
 # AWS Sagemaker compatible image
 FROM base AS sagemaker
diff --git a/Dockerfile_intel b/Dockerfile_intel
index 0cda4d4b..d7ac09af 100644
--- a/Dockerfile_intel
+++ b/Dockerfile_intel
@@ -18,6 +18,8 @@ RUN cargo chef prepare --recipe-path recipe.json
 
 FROM chef AS builder
 
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
@@ -114,7 +116,7 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
     PORT=80
 
 ARG MAMBA_VERSION=23.1.0-1
-ARG PYTHON_VERSION='3.10.10'
+ARG PYTHON_VERSION='3.11.10'
 # Automatically set by buildx
 ARG TARGETPLATFORM
 ENV PATH /opt/conda/bin:$PATH
@@ -153,6 +155,7 @@ ENV CCL_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
 ENV I_MPI_ROOT=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch
 ENV FI_PROVIDER_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric
 ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/opt/mpi/libfabric/lib:/opt/conda/lib/python3.10/site-packages/oneccl_bindings_for_pytorch/lib
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib/"
 
 # Install server
 COPY proto proto
diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs
index 978a495c..2bbb6753 100644
--- a/backends/v3/src/queue.rs
+++ b/backends/v3/src/queue.rs
@@ -357,6 +357,7 @@ impl State {
             let block_allocation = if let (Some((tokens, input_ids)), Some(block_allocator)) =
                 (block_allocation, &self.block_allocator)
             {
+                tracing::debug!("Allocating {tokens} with {input_ids:?}");
                 match block_allocator.allocate(tokens, input_ids).await {
                     None => {
                         // Entry is over budget
diff --git a/router/Cargo.toml b/router/Cargo.toml
index 5c328e8a..6a752db6 100644
--- a/router/Cargo.toml
+++ b/router/Cargo.toml
@@ -61,6 +61,7 @@ uuid = { version = "1.9.1", default-features = false, features = [
 ] }
 csv = "1.3.0"
 ureq = "=2.9"
+pyo3 = { version = "0.22.2", features = ["auto-initialize"] }
 
 
 [build-dependencies]
diff --git a/router/src/server.rs b/router/src/server.rs
index 6a04ab00..8bd49b93 100644
--- a/router/src/server.rs
+++ b/router/src/server.rs
@@ -41,6 +41,7 @@ use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo};
 use hf_hub::{Cache, Repo, RepoType};
 use http::header::AUTHORIZATION;
 use metrics_exporter_prometheus::{Matcher, PrometheusBuilder, PrometheusHandle};
+use pyo3::types::IntoPyDict;
 use serde_json::Value;
 use std::convert::Infallible;
 use std::fs::File;
@@ -48,7 +49,6 @@ use std::io::BufReader;
 use std::net::{IpAddr, Ipv4Addr, SocketAddr};
 use std::path::{Path, PathBuf};
 use thiserror::Error;
-use tokenizers::processors::template::TemplateProcessing;
 use tokenizers::Tokenizer;
 use tokio::select;
 use tokio::signal;
@@ -1860,18 +1860,34 @@ pub async fn run(
     });
 
     let tokenizer: Option<Tokenizer> = tokenizer_filename.and_then(|filename| {
-        let mut tokenizer = Tokenizer::from_file(filename).ok();
-        if let Some(tokenizer) = &mut tokenizer {
-            if let Some(class) = &tokenizer_config.tokenizer_class {
-                if class == "LlamaTokenizer" || class == "LlamaTokenizerFast"{
-                    if let Ok(post_processor) = create_post_processor(tokenizer, &tokenizer_config) {
-                        tracing::info!("Overriding LlamaTokenizer with TemplateProcessing to follow python override defined in https://github.com/huggingface/transformers/blob/4aa17d00690b7f82c95bb2949ea57e22c35b4336/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205");
-                        tokenizer.with_post_processor(post_processor);
-                    }
-                }
-            }
-        }
-        tokenizer
+        use pyo3::prelude::*;
+        let convert = pyo3::Python::with_gil(|py| -> PyResult<()> {
+            let transformers = py.import_bound("transformers")?;
+            let auto = transformers.getattr("AutoTokenizer")?;
+            let from_pretrained = auto.getattr("from_pretrained")?;
+            let args = (tokenizer_name.to_string(),);
+            let kwargs = [(
+                "revision",
+                revision.clone().unwrap_or_else(|| "main".to_string()),
+            )]
+            .into_py_dict_bound(py);
+            let tokenizer = from_pretrained.call(args, Some(&kwargs))?;
+            let save = tokenizer.getattr("save_pretrained")?;
+            let args = ("out".to_string(),);
+            save.call1(args)?;
+            Ok(())
+        })
+        .inspect_err(|err| {
+            tracing::error!("Failed to import python tokenizer {err}");
+        });
+        let filename = if convert.is_ok() {
+            // If we have correctly loaded and resaved with transformers
+            // We might have modified the tokenizer.json according to transformers
+            "out/tokenizer.json".into()
+        } else {
+            filename
+        };
+        Tokenizer::from_file(filename).ok()
     });
 
     let config: Option<Config> = config_filename.and_then(|filename| {
@@ -2591,77 +2607,6 @@ pub enum WebServerError {
     Axum(#[from] axum::BoxError),
 }
 
-/// Create a post_processor for the LlamaTokenizer
-fn create_post_processor(
-    tokenizer: &Tokenizer,
-    tokenizer_config: &HubTokenizerConfig,
-) -> Result<TemplateProcessing, tokenizers::processors::template::TemplateProcessingBuilderError> {
-    let add_bos_token = tokenizer_config.add_bos_token.unwrap_or(true);
-    let add_eos_token = tokenizer_config.add_eos_token.unwrap_or(false);
-
-    let bos_token = tokenizer_config.bos_token.as_ref();
-    let eos_token = tokenizer_config.eos_token.as_ref();
-
-    if add_bos_token && bos_token.is_none() {
-        panic!("add_bos_token = true but bos_token is None");
-    }
-
-    if add_eos_token && eos_token.is_none() {
-        panic!("add_eos_token = true but eos_token is None");
-    }
-
-    let mut single = Vec::new();
-    let mut pair = Vec::new();
-    let mut special_tokens = Vec::new();
-
-    if add_bos_token {
-        if let Some(bos) = bos_token {
-            let bos_token_id = tokenizer
-                .token_to_id(bos.as_str())
-                .expect("Should have found the bos token id");
-            special_tokens.push((bos.as_str(), bos_token_id));
-            single.push(format!("{}:0", bos.as_str()));
-            pair.push(format!("{}:0", bos.as_str()));
-        }
-    }
-
-    single.push("$A:0".to_string());
-    pair.push("$A:0".to_string());
-
-    if add_eos_token {
-        if let Some(eos) = eos_token {
-            let eos_token_id = tokenizer
-                .token_to_id(eos.as_str())
-                .expect("Should have found the eos token id");
-            special_tokens.push((eos.as_str(), eos_token_id));
-            single.push(format!("{}:0", eos.as_str()));
-            pair.push(format!("{}:0", eos.as_str()));
-        }
-    }
-
-    if add_bos_token {
-        if let Some(bos) = bos_token {
-            pair.push(format!("{}:1", bos.as_str()));
-        }
-    }
-
-    pair.push("$B:1".to_string());
-
-    if add_eos_token {
-        if let Some(eos) = eos_token {
-            pair.push(format!("{}:1", eos.as_str()));
-        }
-    }
-
-    let post_processor = TemplateProcessing::builder()
-        .try_single(single)?
-        .try_pair(pair)?
-        .special_tokens(special_tokens)
-        .build()?;
-
-    Ok(post_processor)
-}
-
 type PreparedInput = (String, Option<GrammarType>, bool);
 
 fn prepare_chat_input(
diff --git a/server/tests/models/test_bloom.py b/server/tests/models/test_bloom.py
index 08292920..dff5239b 100644
--- a/server/tests/models/test_bloom.py
+++ b/server/tests/models/test_bloom.py
@@ -267,7 +267,7 @@ def test_batch_concatenate(
     assert next_batch.max_input_length == 3
 
     assert next_batch.requests[0] == next_batch_0.requests[0]
-    assert next_batch.requests[1:] == next_batch_1.requests
+    assert next_batch.requests[1:] == list(next_batch_1.requests)
 
     assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
     assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
diff --git a/server/tests/models/test_causal_lm.py b/server/tests/models/test_causal_lm.py
index c000ef26..11ca5efe 100644
--- a/server/tests/models/test_causal_lm.py
+++ b/server/tests/models/test_causal_lm.py
@@ -262,7 +262,7 @@ def test_batch_concatenate(
     assert next_batch.max_input_length == 3
 
     assert next_batch.requests[0] == next_batch_0.requests[0]
-    assert next_batch.requests[1:] == next_batch_1.requests
+    assert next_batch.requests[1:] == list(next_batch_1.requests)
 
     assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
     assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
diff --git a/server/tests/models/test_seq2seq_lm.py b/server/tests/models/test_seq2seq_lm.py
index 02666042..5ba7c64c 100644
--- a/server/tests/models/test_seq2seq_lm.py
+++ b/server/tests/models/test_seq2seq_lm.py
@@ -281,7 +281,7 @@ def test_batch_concatenate(
     assert next_batch.max_decoder_input_length == 3
 
     assert next_batch.requests[0] == next_batch_0.requests[0]
-    assert next_batch.requests[1:] == next_batch_1.requests
+    assert next_batch.requests[1:] == list(next_batch_1.requests)
 
     assert next_batch.next_token_choosers[0] == next_batch_0.next_token_choosers[0]
     assert next_batch.next_token_choosers[1:] == next_batch_1.next_token_choosers
diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py
index 65180499..834056aa 100644
--- a/server/text_generation_server/models/flash_causal_lm.py
+++ b/server/text_generation_server/models/flash_causal_lm.py
@@ -272,6 +272,8 @@ class FlashCausalLMBatch(Batch):
                 assert prefix_len > 0
                 prefix_len -= 1
 
+            # Commented as it's costly.
+            # log_master(logger.debug, "Tokenized input ids {tokenized_input}")
             prefix_ids.append(tokenized_input[:prefix_len])
             tokenized_input = tokenized_input[prefix_len:]